diff --git a/contentcuration/search/management/commands/set_contentnode_tsvectors.py b/contentcuration/search/management/commands/set_contentnode_tsvectors.py index 4e5673d9ec..fcf0eb55a2 100644 --- a/contentcuration/search/management/commands/set_contentnode_tsvectors.py +++ b/contentcuration/search/management/commands/set_contentnode_tsvectors.py @@ -8,15 +8,17 @@ from django.core.management.base import BaseCommand from django.db.models import Exists from django.db.models import OuterRef +from django.db.models import Value from search.constants import CONTENTNODE_AUTHOR_TSVECTOR from search.constants import CONTENTNODE_KEYWORDS_TSVECTOR from search.models import ContentNodeFullTextSearch +from contentcuration.models import Channel from contentcuration.models import ContentNode logmodule.basicConfig(level=logmodule.INFO) -logging = logmodule.getLogger("command") +logging = logmodule.getLogger(__name__) CHUNKSIZE = 10000 @@ -26,34 +28,40 @@ class Command(BaseCommand): def handle(self, *args, **options): start = time.time() - tsvector_not_already_inserted_query = ~Exists(ContentNodeFullTextSearch.objects.filter(contentnode_id=OuterRef("id"))) + all_published_channels = list(Channel.objects.filter(main_tree__published=True, deleted=False).values("id", "main_tree__tree_id")) - tsvector_node_query = (ContentNode._annotate_channel_id(ContentNode.objects) - .annotate(contentnode_tags=StringAgg("tags__tag_name", delimiter=" "), - keywords_tsvector=CONTENTNODE_KEYWORDS_TSVECTOR, - author_tsvector=CONTENTNODE_AUTHOR_TSVECTOR) - .filter(tsvector_not_already_inserted_query, published=True, channel_id__isnull=False) - .values("id", "channel_id", "keywords_tsvector", "author_tsvector").order_by()) - - insertable_nodes_tsvector = list(tsvector_node_query[:CHUNKSIZE]) total_tsvectors_inserted = 0 - while insertable_nodes_tsvector: - logging.info("Inserting contentnode tsvectors.") + for published_channel in all_published_channels: + tsvector_not_already_inserted_query = ~Exists(ContentNodeFullTextSearch.objects.filter(contentnode_id=OuterRef("id"))) + tsvector_nodes_query = (ContentNode.objects + .annotate(channel_id=Value(published_channel["id"]), + contentnode_tags=StringAgg("tags__tag_name", delimiter=" "), + keywords_tsvector=CONTENTNODE_KEYWORDS_TSVECTOR, + author_tsvector=CONTENTNODE_AUTHOR_TSVECTOR) + .filter(tsvector_not_already_inserted_query, tree_id=published_channel["main_tree__tree_id"], published=True, complete=True) + .values("id", "channel_id", "keywords_tsvector", "author_tsvector") + .order_by()) + + insertable_nodes_tsvector = list(tsvector_nodes_query[:CHUNKSIZE]) + logging.info("Inserting contentnode tsvectors of channel {}.".format(published_channel["id"])) + + while insertable_nodes_tsvector: + insert_objs = list() + for node in insertable_nodes_tsvector: + obj = ContentNodeFullTextSearch(contentnode_id=node["id"], channel_id=node["channel_id"], + keywords_tsvector=node["keywords_tsvector"], author_tsvector=node["author_tsvector"]) + insert_objs.append(obj) - insert_objs = list() - for node in insertable_nodes_tsvector: - obj = ContentNodeFullTextSearch(contentnode_id=node["id"], channel_id=node["channel_id"], - keywords_tsvector=node["keywords_tsvector"], author_tsvector=node["author_tsvector"]) - insert_objs.append(obj) + inserted_objs_list = ContentNodeFullTextSearch.objects.bulk_create(insert_objs) - inserted_objs_list = ContentNodeFullTextSearch.objects.bulk_create(insert_objs) + current_inserts_count = len(inserted_objs_list) + total_tsvectors_inserted = total_tsvectors_inserted + current_inserts_count - current_inserts_count = len(inserted_objs_list) - total_tsvectors_inserted = total_tsvectors_inserted + current_inserts_count + logging.info("Inserted {} contentnode tsvectors of channel {}.".format(current_inserts_count, published_channel["id"])) - logging.info("Inserted {} contentnode tsvectors.".format(current_inserts_count)) + insertable_nodes_tsvector = list(tsvector_nodes_query[:CHUNKSIZE]) - insertable_nodes_tsvector = list(tsvector_node_query[:CHUNKSIZE]) + logging.info("Insertion complete for channel {}.".format(published_channel["id"])) logging.info("Completed! Successfully inserted total of {} contentnode tsvectors in {} seconds.".format(total_tsvectors_inserted, time.time() - start))