From 0c2006360f76d783b59b013c580ed2995e27131d Mon Sep 17 00:00:00 2001
From: Vivek Agrawal <vivekmittalagrawal@gmail.com>
Date: Tue, 20 Dec 2022 19:08:47 +0530
Subject: [PATCH 1/2] =?UTF-8?q?Optimized=20tsvectors=20insertion=20?=
 =?UTF-8?q?=F0=9F=9A=80?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
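Content-Transfer-Encoding: 8bit

Iterate over published, non-deleted channels and select each channel's
nodes by tree_id, annotating channel_id as a constant value, instead of
running one query that annotates channel_id across all content nodes.
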
---
 .../commands/set_contentnode_tsvectors.py     | 52 +++++++++++--------
 1 file changed, 30 insertions(+), 22 deletions(-)

diff --git a/contentcuration/search/management/commands/set_contentnode_tsvectors.py b/contentcuration/search/management/commands/set_contentnode_tsvectors.py
index 4e5673d9ec..4a901a055c 100644
--- a/contentcuration/search/management/commands/set_contentnode_tsvectors.py
+++ b/contentcuration/search/management/commands/set_contentnode_tsvectors.py
@@ -8,15 +8,17 @@
 from django.core.management.base import BaseCommand
 from django.db.models import Exists
 from django.db.models import OuterRef
+from django.db.models import Value
 from search.constants import CONTENTNODE_AUTHOR_TSVECTOR
 from search.constants import CONTENTNODE_KEYWORDS_TSVECTOR
 from search.models import ContentNodeFullTextSearch
 
+from contentcuration.models import Channel
 from contentcuration.models import ContentNode
 
 
 logmodule.basicConfig(level=logmodule.INFO)
-logging = logmodule.getLogger("command")
+logging = logmodule.getLogger(__name__)
 
 CHUNKSIZE = 10000
 
@@ -26,34 +28,40 @@ class Command(BaseCommand):
     def handle(self, *args, **options):
         start = time.time()
 
-        tsvector_not_already_inserted_query = ~Exists(ContentNodeFullTextSearch.objects.filter(contentnode_id=OuterRef("id")))
+        all_published_channels = list(Channel.objects.filter(main_tree__published=True, deleted=False).values("id", "main_tree__tree_id"))
 
-        tsvector_node_query = (ContentNode._annotate_channel_id(ContentNode.objects)
-                               .annotate(contentnode_tags=StringAgg("tags__tag_name", delimiter=" "),
-                                         keywords_tsvector=CONTENTNODE_KEYWORDS_TSVECTOR,
-                                         author_tsvector=CONTENTNODE_AUTHOR_TSVECTOR)
-                               .filter(tsvector_not_already_inserted_query, published=True, channel_id__isnull=False)
-                               .values("id", "channel_id", "keywords_tsvector", "author_tsvector").order_by())
-
-        insertable_nodes_tsvector = list(tsvector_node_query[:CHUNKSIZE])
         total_tsvectors_inserted = 0
 
-        while insertable_nodes_tsvector:
-            logging.info("Inserting contentnode tsvectors.")
+        for published_channel in all_published_channels:
+            tsvector_not_already_inserted_query = ~Exists(ContentNodeFullTextSearch.objects.filter(contentnode_id=OuterRef("id")))
+            tsvector_nodes_query = (ContentNode.objects
+                                    .annotate(channel_id=Value(published_channel["id"]),
+                                              contentnode_tags=StringAgg("tags__tag_name", delimiter=" "),
+                                              keywords_tsvector=CONTENTNODE_KEYWORDS_TSVECTOR,
+                                              author_tsvector=CONTENTNODE_AUTHOR_TSVECTOR)
+                                    .filter(tsvector_not_already_inserted_query, tree_id=published_channel["main_tree__tree_id"])
+                                    .values("id", "channel_id", "keywords_tsvector", "author_tsvector")
+                                    .order_by())
+
+            insertable_nodes_tsvector = list(tsvector_nodes_query[:CHUNKSIZE])
+            logging.info("Inserting contentnode tsvectors of channel {}.".format(published_channel["id"]))
+
+            while insertable_nodes_tsvector:
+                insert_objs = list()
+                for node in insertable_nodes_tsvector:
+                    obj = ContentNodeFullTextSearch(contentnode_id=node["id"], channel_id=node["channel_id"],
+                                                    keywords_tsvector=node["keywords_tsvector"], author_tsvector=node["author_tsvector"])
+                    insert_objs.append(obj)
 
-            insert_objs = list()
-            for node in insertable_nodes_tsvector:
-                obj = ContentNodeFullTextSearch(contentnode_id=node["id"], channel_id=node["channel_id"],
-                                                keywords_tsvector=node["keywords_tsvector"], author_tsvector=node["author_tsvector"])
-                insert_objs.append(obj)
+                inserted_objs_list = ContentNodeFullTextSearch.objects.bulk_create(insert_objs)
 
-            inserted_objs_list = ContentNodeFullTextSearch.objects.bulk_create(insert_objs)
+                current_inserts_count = len(inserted_objs_list)
+                total_tsvectors_inserted = total_tsvectors_inserted + current_inserts_count
 
-            current_inserts_count = len(inserted_objs_list)
-            total_tsvectors_inserted = total_tsvectors_inserted + current_inserts_count
+                logging.info("Inserted {} contentnode tsvectors of channel {}.".format(current_inserts_count, published_channel["id"]))
 
-            logging.info("Inserted {} contentnode tsvectors.".format(current_inserts_count))
+                insertable_nodes_tsvector = list(tsvector_nodes_query[:CHUNKSIZE])
 
-            insertable_nodes_tsvector = list(tsvector_node_query[:CHUNKSIZE])
+            logging.info("Insertion complete for channel {}.".format(published_channel["id"]))
 
         logging.info("Completed! Successfully inserted total of {} contentnode tsvectors in {} seconds.".format(total_tsvectors_inserted, time.time() - start))

From ec99d4765c7805d6a6af8493595b39333a0f11e3 Mon Sep 17 00:00:00 2001
From: Vivek Agrawal <vivekmittalagrawal@gmail.com>
Date: Tue, 27 Dec 2022 21:03:17 +0530
Subject: [PATCH 2/2] Don't create tsvectors for incomplete or unpublished
 nodes

---
 .../search/management/commands/set_contentnode_tsvectors.py     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/contentcuration/search/management/commands/set_contentnode_tsvectors.py b/contentcuration/search/management/commands/set_contentnode_tsvectors.py
index 4a901a055c..fcf0eb55a2 100644
--- a/contentcuration/search/management/commands/set_contentnode_tsvectors.py
+++ b/contentcuration/search/management/commands/set_contentnode_tsvectors.py
@@ -39,7 +39,7 @@ def handle(self, *args, **options):
                                               contentnode_tags=StringAgg("tags__tag_name", delimiter=" "),
                                               keywords_tsvector=CONTENTNODE_KEYWORDS_TSVECTOR,
                                               author_tsvector=CONTENTNODE_AUTHOR_TSVECTOR)
-                                    .filter(tsvector_not_already_inserted_query, tree_id=published_channel["main_tree__tree_id"])
+                                    .filter(tsvector_not_already_inserted_query, tree_id=published_channel["main_tree__tree_id"], published=True, complete=True)
                                     .values("id", "channel_id", "keywords_tsvector", "author_tsvector")
                                     .order_by())