Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Creation of metadata labels model #3288

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 58 additions & 0 deletions contentcuration/contentcuration/constants/le_labels.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
"""
This module contains constants representing the ids of the metadata labels.

This file should be removed, and its contents replaced by loading the labels from
the le-utils package, once that package contains the definitive list of labels and their ids.
"""
from __future__ import unicode_literals

# Each section below defines the individual label ids of one label family,
# followed by a tuple collecting every id of that family.

# Grade Level
UPPER_PRIMARY = "upperprimary"
SPECIALIZED_PROFESSIONAL_TRAINING = "6a256feb4d882f5d90b862f83e789166"
SECONDARY = "secondary"
GRADE_LEVEL = (UPPER_PRIMARY, SPECIALIZED_PROFESSIONAL_TRAINING, SECONDARY)

# Resource Type
GAME = "game"
BOOK = "book"
VIDEO = "video"
AUDIO = "audio"
RESOURCE_TYPE = (GAME, BOOK, VIDEO, AUDIO)

# Learning Activity
PLAY = "0aacea8079e044618e297efc2486594f"
READ = "bfbff1a273e1b9c84a2d3dfa4af948fd"
LISTEN = "363e0d138291774124d13c5d309f0e94"
WATCH = "e7e6184e49758c82ba7b601b1b90af09"
LEARNING_ACTIVITY = (PLAY, READ, LISTEN, WATCH)

# Accessibility
HIGH_CONTRAST_DISPLAY = "d6bada40219e36b7654715d8c5553aa0"
SIGN_LANGUAGE_CAPTIONS = "signlanguage"
ACCESSIBILITY = (HIGH_CONTRAST_DISPLAY, SIGN_LANGUAGE_CAPTIONS)
# Original (misspelled) name, kept as an alias so existing imports keep working.
ACCESIBILITY = ACCESSIBILITY

# Category
# Category ids are dotted paths: a child id is its parent id plus ".<name>",
# so a parent id is always a prefix of the ids of all its descendants.
FOR_SCHOOL = "for_school"
NON_FORMAL = "nonformal"
BASIC_SKILLS = "basicskills"
MATH = "{}.math".format(FOR_SCHOOL)
SCIENCES = "{}.sciences".format(FOR_SCHOOL)
BIOLOGY = "{}.biology".format(SCIENCES)
CALCULUS = "{}.calculus".format(MATH)
ALGEBRA = "{}.algebra".format(MATH)
ROBOTICS = "{}.robotics".format(NON_FORMAL)
DIGITAL_LITERACY = "{}.digitalliteracy".format(BASIC_SKILLS)
CATEGORY = (
    FOR_SCHOOL,
    NON_FORMAL,
    BASIC_SKILLS,
    MATH,
    SCIENCES,
    BIOLOGY,
    CALCULUS,
    ALGEBRA,
    ROBOTICS,
    DIGITAL_LITERACY,
)

# MATH together with its two descendant categories.
MATH_ARRAY = (MATH, CALCULUS, ALGEBRA)
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Generated by Django 3.2.4 on 2021-09-27 18:20
from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    """
    Add the five ``*_labels`` text columns to ``contentnode`` and index four of them.

    All five columns are nullable, blank-able ``TextField``s. An index is
    created for every label column except ``accessibility_labels``.
    """

    dependencies = [("contentcuration", "0132_auto_20210708_0011")]

    # Column additions come first, then the index additions.
    operations = [
        migrations.AddField(
            model_name="contentnode",
            name=column,
            field=models.TextField(blank=True, null=True),
        )
        for column in (
            "accessibility_labels",
            "category_labels",
            "grade_level_labels",
            "learning_activity_labels",
            "resource_type_labels",
        )
    ] + [
        migrations.AddIndex(
            model_name="contentnode",
            index=models.Index(fields=[column], name=index_name),
        )
        for column, index_name in (
            ("grade_level_labels", "grade_level_label_idx"),
            ("resource_type_labels", "resource_type_label_idx"),
            ("learning_activity_labels", "learning_activity_label_idx"),
            ("category_labels", "category_label_idx"),
        )
    ]
18 changes: 18 additions & 0 deletions contentcuration/contentcuration/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -1068,6 +1068,10 @@ def __str__(self):
# Names given to database indexes on the content node table.
NODE_ID_INDEX_NAME = "node_id_idx"
NODE_MODIFIED_INDEX_NAME = "node_modified_idx"
NODE_MODIFIED_DESC_INDEX_NAME = "node_modified_desc_idx"
# Index names for the metadata label columns; these are the same names the
# label migration uses in its AddIndex operations for contentnode.
GRADE_LEVEL_LABEL_INDEX = "grade_level_label_idx"
RESOURCE_TYPE_LABEL_INDEX = "resource_type_label_idx"
LEARNING_ACTIVITY_LABEL_INDEX = "learning_activity_label_idx"
CATEGORY_LABEL_INDEX = "category_label_idx"


class ContentNode(MPTTModel, models.Model):
Expand Down Expand Up @@ -1146,6 +1150,12 @@ class ContentNode(MPTTModel, models.Model):
role_visibility = models.CharField(max_length=50, choices=roles.choices, default=roles.LEARNER)
freeze_authoring_data = models.BooleanField(default=False)

grade_level_labels = models.TextField(blank=True, null=True)
resource_type_labels = models.TextField(blank=True, null=True)
learning_activity_labels = models.TextField(blank=True, null=True)
accessibility_labels = models.TextField(blank=True, null=True)
category_labels = models.TextField(blank=True, null=True)

objects = CustomContentNodeTreeManager()

# Track all updates and ignore a blacklist of attributes
Expand Down Expand Up @@ -1745,6 +1755,14 @@ class Meta:
indexes = [
models.Index(fields=["node_id"], name=NODE_ID_INDEX_NAME),
models.Index(fields=["-modified"], name=NODE_MODIFIED_DESC_INDEX_NAME),
models.Index(fields=["grade_level_labels"], name=GRADE_LEVEL_LABEL_INDEX),
models.Index(
fields=["resource_type_labels"], name=RESOURCE_TYPE_LABEL_INDEX
),
models.Index(
fields=["learning_activity_labels"], name=LEARNING_ACTIVITY_LABEL_INDEX
),
models.Index(fields=["category_labels"], name=CATEGORY_LABEL_INDEX),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you remind me, did you find any benefit with these indices? If we go with a bitmask approach for searching, then perhaps we can avoid adding these and prevent the performance hit on writing to the table.

]


Expand Down
139 changes: 139 additions & 0 deletions contentcuration/contentcuration/tests/test_labels.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
import logging
import os
from random import randint
from random import shuffle
from time import time

import pytest

from .base import BaseTestCase
from contentcuration.constants.le_labels import ACCESIBILITY
from contentcuration.constants.le_labels import CALCULUS
from contentcuration.constants.le_labels import CATEGORY
from contentcuration.constants.le_labels import GRADE_LEVEL
from contentcuration.constants.le_labels import LEARNING_ACTIVITY
from contentcuration.constants.le_labels import MATH
from contentcuration.constants.le_labels import RESOURCE_TYPE
from contentcuration.constants.le_labels import VIDEO
from contentcuration.models import ContentNode
from contentcuration.utils.db_tools import TreeBuilder

# from faker import Faker
# from .testdata import topic

# Module-level logger, set to emit DEBUG messages and above.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


def _random_labels(labels, length_labels):
amount = randint(0, length_labels + 1)
if amount:
shuffle(labels)
return labels[:amount]
return []


def assign_labels():
    """
    Give every ContentNode a random set of labels for each label family.

    For each node, and for each of the five label fields, a random selection
    is drawn with ``_random_labels`` and stored as a comma-separated string;
    the node is then saved.
    """
    # (model field, mutable pool of label ids, number of ids in the family)
    label_sources = (
        ("grade_level_labels", list(GRADE_LEVEL), len(GRADE_LEVEL)),
        ("resource_type_labels", list(RESOURCE_TYPE), len(RESOURCE_TYPE)),
        ("learning_activity_labels", list(LEARNING_ACTIVITY), len(LEARNING_ACTIVITY)),
        ("accessibility_labels", list(ACCESIBILITY), len(ACCESIBILITY)),
        ("category_labels", list(CATEGORY), len(CATEGORY)),
    )
    for node in ContentNode.objects.all():
        for field_name, pool, pool_size in label_sources:
            picked = _random_labels(pool, pool_size)
            setattr(node, field_name, ",".join(picked))
        node.save()


class NodesLabelsTestCase(BaseTestCase):
    """Label queries run against the fixture tree after random label assignment."""

    def setUp(self):
        # creates node hierarchy according to
        # contentcuration/contentcuration/tests/fixtures/tree.json
        super(NodesLabelsTestCase, self).setUp()
        assign_labels()
        self.node_query = ContentNode.objects.filter(title__icontains="Topic")

    def test_nodes_of_a_label(self):
        """
        Get all ContentNodes with a label or one of its descendant labels
        """
        # Category ids are dotted paths, so the CALCULUS id contains the MATH
        # id as a prefix: every node matching CALCULUS also matches MATH.
        # Both queries must target category_labels for the comparison to be
        # meaningful.
        maths = ContentNode.objects.filter(category_labels__contains=MATH)
        calculus = ContentNode.objects.filter(category_labels__contains=CALCULUS)
        assert maths.count() >= calculus.count()


@pytest.fixture(scope="class")
def create_many_nodes():
    """Class-scoped fixture that builds a node tree with ``TreeBuilder``."""
    print("Creating nodes")
    # The builder is invoked only for its side effects; the instance is discarded.
    TreeBuilder(levels=3, num_children=10)


@pytest.mark.skipif(
    os.environ.get("LABELS_MASSIVE", "false") != "true",
    reason="Env variable to run massive test is not set",
)
@pytest.mark.usefixtures("create_many_nodes")
class LabelsMassiveTestCase(BaseTestCase):
    """
    Opt-in timing run for labels stored as comma-separated strings.

    Skipped unless the LABELS_MASSIVE environment variable is "true". Run with:
    LABELS_MASSIVE=true pytest -s contentcuration/contentcuration/tests/test_labels.py::LabelsMassiveTestCase
    """

    def setUp(self):
        self.elapsed = 0
        self.nodes = ContentNode.objects.all()
        # len() evaluates the queryset, so the nodes are loaded here.
        self.records = len(self.nodes)
        print("{} nodes created".format(self.records))

    def test_massive_str(self):
        """Time random label assignment, then two ``contains`` lookups."""
        started = time()
        # (model field, mutable pool of label ids, number of ids in the family)
        label_sources = (
            ("grade_level_labels", list(GRADE_LEVEL), len(GRADE_LEVEL)),
            ("resource_type_labels", list(RESOURCE_TYPE), len(RESOURCE_TYPE)),
            (
                "learning_activity_labels",
                list(LEARNING_ACTIVITY),
                len(LEARNING_ACTIVITY),
            ),
            ("accessibility_labels", list(ACCESIBILITY), len(ACCESIBILITY)),
            ("category_labels", list(CATEGORY), len(CATEGORY)),
        )
        for node in self.nodes:
            for field_name, pool, pool_size in label_sources:
                picked = _random_labels(pool, pool_size)
                setattr(node, field_name, ",".join(picked))
            node.save()
        self.elapsed = time() - started
        print(
            "USING STRING: Assigning random labels to {} nodes took {} seconds".format(
                self.records, self.elapsed
            )
        )

        # Lookup by a flat resource-type label.
        started = time()
        video_nodes = ContentNode.objects.filter(resource_type_labels__contains=VIDEO)
        matches = len(video_nodes)
        self.elapsed = time() - started
        print(
            "USING STRING: Finding for resource label in {} nodes took {} seconds".format(
                matches, self.elapsed
            )
        )

        # Lookup by a parent category id.
        started = time()
        math_nodes = ContentNode.objects.filter(category_labels__contains=MATH)
        matches = len(math_nodes)
        self.elapsed = time() - started
        print(
            "USING STRING: Finding for maths and descendants label in {} nodes took {} seconds".format(
                matches, self.elapsed
            )
        )