From 8c24db5fd6e0d0fb07136e8f4ac2654f23b59310 Mon Sep 17 00:00:00 2001 From: lismana Date: Wed, 12 Jun 2024 23:36:42 -0400 Subject: [PATCH] Fix clinical data counts query/filter using new derived table --- .../StudyViewMyBatisRepository.java | 5 +- .../db-scripts/clickhouse/derived_tables.sql | 46 ++++++++ .../clickhouse/materialized_views.sql | 100 +++++++++++++++--- .../resources/db-scripts/clickhouse/views.sql | 51 --------- .../StudyViewFilterMapper.xml | 15 ++- .../mybatisclickhouse/StudyViewMapper.xml | 82 ++++++++++++-- 6 files changed, 215 insertions(+), 84 deletions(-) create mode 100644 src/main/resources/db-scripts/clickhouse/derived_tables.sql diff --git a/src/main/java/org/cbioportal/persistence/mybatisclickhouse/StudyViewMyBatisRepository.java b/src/main/java/org/cbioportal/persistence/mybatisclickhouse/StudyViewMyBatisRepository.java index c723806885b..b227d71f23e 100644 --- a/src/main/java/org/cbioportal/persistence/mybatisclickhouse/StudyViewMyBatisRepository.java +++ b/src/main/java/org/cbioportal/persistence/mybatisclickhouse/StudyViewMyBatisRepository.java @@ -13,13 +13,14 @@ import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Repository; +import java.util.Collections; import java.util.List; import java.util.Map; @Repository public class StudyViewMyBatisRepository implements StudyViewRepository { - private static final List FILTERED_CLINICAL_ATTR_VALUES = List.of("NA", "NAN", "N/A"); + private static final List FILTERED_CLINICAL_ATTR_VALUES = Collections.emptyList(); private final StudyViewMapper mapper; @Autowired @@ -59,7 +60,7 @@ public List getPatientClinicalDataCounts(StudyViewFilter stud @Override public List getClinicalDataAttributeNames(ClinicalAttributeDataSource clinicalAttributeDataSource, ClinicalAttributeDataType dataType) { - String tableName = clinicalAttributeDataSource.getValue().toLowerCase() + "_clinical_attribute_" + dataType.getValue().toLowerCase() + "_view"; + String tableName = clinicalAttributeDataSource.getValue().toLowerCase() + "_clinical_attribute_" + dataType.getValue().toLowerCase() + "_mv"; return mapper.getClinicalAttributeNames(tableName); } diff --git a/src/main/resources/db-scripts/clickhouse/derived_tables.sql b/src/main/resources/db-scripts/clickhouse/derived_tables.sql new file mode 100644 index 00000000000..3b2ffbcd17c --- /dev/null +++ b/src/main/resources/db-scripts/clickhouse/derived_tables.sql @@ -0,0 +1,46 @@ +-- clinical_data_derived +DROP TABLE IF EXISTS clinical_data_derived +CREATE TABLE IF NOT EXISTS clinical_data_derived +( + sample_unique_id String, + patient_unique_id String, + attribute_name String, + attribute_value String, + cancer_study_identifier String, + type String +) + ENGINE=MergeTree + ORDER BY sample_unique_id + +-- Insert sample attribute data +INSERT INTO TABLE clinical_data_derived +SELECT sm.sample_unique_id AS sample_unique_id, + sm.patient_unique_id AS patient_unique_id, + cam.attr_id AS attribute_name, + csamp.attr_value AS attribute_value, + cs.cancer_study_identifier AS cancer_study_identifier, + 'sample' AS type +FROM sling_db_2024_05_23_original.sample_mv AS sm + INNER JOIN sling_db_2024_05_23_original.cancer_study AS cs + ON sm.cancer_study_identifier = cs.cancer_study_identifier + FULL OUTER JOIN sling_db_2024_05_23_original.clinical_attribute_meta AS cam + ON cs.cancer_study_id = cam.cancer_study_id + FULL OUTER JOIN sling_db_2024_05_23_original.clinical_sample AS csamp + ON (sm.internal_id = csamp.internal_id) AND (csamp.attr_id = cam.attr_id) +WHERE cam.patient_attribute = 0 + +-- INSERT patient attribute data +INSERT INTO TABLE clinical_data_derived +SELECT NULL AS sample_unique_id, + concat(cs.cancer_study_identifier, '_', p.stable_id) AS patient_unique_id, + cam.attr_id AS attribute_name, + clinpat.attr_value AS attribute_value, + cs.cancer_study_identifier AS cancer_study_identifier, + 'patient' AS type +FROM sling_db_2024_05_23_original.patient AS p + INNER JOIN sling_db_2024_05_23_original.cancer_study AS cs ON p.cancer_study_id = cs.cancer_study_id + FULL OUTER JOIN sling_db_2024_05_23_original.clinical_attribute_meta AS cam + ON cs.cancer_study_id = cam.cancer_study_id + FULL OUTER JOIN sling_db_2024_05_23_original.clinical_patient AS clinpat + ON (p.internal_id = clinpat.internal_id) AND (clinpat.attr_id = cam.attr_id) +WHERE cam.patient_attribute = 1 diff --git a/src/main/resources/db-scripts/clickhouse/materialized_views.sql b/src/main/resources/db-scripts/clickhouse/materialized_views.sql index 6704ecd9bad..c5168ae4b14 100644 --- a/src/main/resources/db-scripts/clickhouse/materialized_views.sql +++ b/src/main/resources/db-scripts/clickhouse/materialized_views.sql @@ -1,14 +1,6 @@ -DROP TABLE IF EXISTS sample_clinical_attribute_numeric; -DROP TABLE IF EXISTS sample_clinical_attribute_categorical; -DROP TABLE IF EXISTS patient_clinical_attribute_numeric; -DROP TABLE IF EXISTS patient_clinical_attribute_categorical; DROP TABLE IF EXISTS sample_columnstore; DROP TABLE IF EXISTS sample_list_columnstore; DROP TABLE IF EXISTS genomic_event; -DROP VIEW IF EXISTS sample_clinical_attribute_numeric_mv; -DROP VIEW IF EXISTS sample_clinical_attribute_categorical_mv; -DROP VIEW IF EXISTS patient_clinical_attribute_numeric_mv; -DROP VIEW IF EXISTS patient_clinical_attribute_categorical_mv; DROP VIEW IF EXISTS sample_columnstore_mv; DROP VIEW IF EXISTS sample_list_columnstore_mv; @@ -135,17 +127,91 @@ FROM cancer_study cs INNER JOIN clinical_patient cp on p.internal_id = cp.internal_id WHERE NOT match(cp.attr_value, '^[\d\.]+$'); +--patient_clinical_attribute_categorical_mv +DROP VIEW IF EXISTS patient_clinical_attribute_categorical_mv; CREATE MATERIALIZED VIEW patient_clinical_attribute_categorical_mv - TO patient_clinical_attribute_categorical AS -SELECT concat(cs.cancer_study_identifier, '_', p.stable_id) as patient_unique_id, - cp.attr_id as attribute_name, - cp.attr_value as attribute_value, - cs.cancer_study_identifier as cancer_study_identifier -FROM cancer_study cs - INNER JOIN patient p on cs.cancer_study_id = p.cancer_study_id - INNER JOIN clinical_patient cp on p.internal_id = cp.internal_id -WHERE NOT match(cp.attr_value, '^[\d\.]+$'); + ENGINE = MergeTree() + ORDER BY cancer_study_identifier + SETTINGS allow_nullable_key = 1 + POPULATE +AS +SELECT concat(cs.cancer_study_identifier, '_', p.stable_id) AS patient_unique_id, + cp.attr_id AS attribute_name, + cp.attr_value AS attribute_value, + cs.cancer_study_identifier AS cancer_study_identifier +FROM clinical_patient AS cp + INNER JOIN patient AS p ON cp.internal_id = p.internal_id + INNER JOIN cancer_study AS cs ON p.cancer_study_id = cs.cancer_study_id + INNER JOIN clinical_attribute_meta AS cam + ON (cp.attr_id = cam.attr_id) AND (cs.cancer_study_id = cam.cancer_study_id) +WHERE cam.datatype = 'STRING' + + +--patient_clinical_attribute_numeric_mv +DROP VIEW IF EXISTS patient_clinical_attribute_numeric_mv; +CREATE MATERIALIZED VIEW patient_clinical_attribute_numeric_mv + ENGINE = MergeTree() + ORDER BY cancer_study_identifier + SETTINGS allow_nullable_key = 1 + POPULATE +AS +SELECT concat(cs.cancer_study_identifier, '_', p.stable_id) AS patient_unique_id, + cp.attr_id AS attribute_name, + cp.attr_value AS attribute_value, + cs.cancer_study_identifier AS cancer_study_identifier +FROM sling_db_2024_05_23_original.clinical_patient AS cp + INNER JOIN sling_db_2024_05_23_original.patient AS p ON cp.internal_id = p.internal_id + INNER JOIN sling_db_2024_05_23_original.cancer_study AS cs ON p.cancer_study_id = cs.cancer_study_id + INNER JOIN sling_db_2024_05_23_original.clinical_attribute_meta AS cam + ON (cp.attr_id = cam.attr_id) AND (cs.cancer_study_id = cam.cancer_study_id) +WHERE cam.datatype = 'NUMBER' + +-- sample_clinical_attribute_categorical_mv +DROP VIEW IF EXISTS sample_clinical_attribute_categorical_mv; +CREATE MATERIALIZED VIEW sample_clinical_attribute_categorical_mv + ENGINE = MergeTree() + ORDER BY cancer_study_identifier + SETTINGS allow_nullable_key = 1 + POPULATE +AS +SELECT s.sample_unique_id, + s.patient_unique_id, + csamp.attr_id AS attribute_name, + csamp.attr_value AS attribute_value, + s.cancer_study_identifier AS cancer_study_identifier +FROM sling_db_2024_05_23_original.clinical_sample AS csamp + INNER JOIN sling_db_2024_05_23_original.sample_mv AS s ON csamp.internal_id = s.internal_id + INNER JOIN sling_db_2024_05_23_original.cancer_study AS cs + ON s.cancer_study_identifier = cs.cancer_study_identifier + INNER JOIN sling_db_2024_05_23_original.clinical_attribute_meta AS cam + ON (csamp.attr_id = cam.attr_id) AND (cs.cancer_study_id = cam.cancer_study_id) +WHERE cam.datatype = 'STRING' + +-- sample_clinical_attribute_numeric_mv +DROP VIEW IF EXISTS sample_clinical_attribute_numeric_mv; +CREATE MATERIALIZED VIEW sample_clinical_attribute_numeric_mv + ENGINE = MergeTree() + ORDER BY cancer_study_identifier + SETTINGS allow_nullable_key = 1 + POPULATE +AS +SELECT s.sample_unique_id, + s.patient_unique_id, + csamp.attr_id AS attribute_name, + csamp.attr_value AS attribute_value, + s.cancer_study_identifier AS cancer_study_identifier +FROM clinical_sample AS csamp + INNER JOIN sample_mv AS s ON csamp.internal_id = s.internal_id + INNER JOIN cancer_study AS cs + ON s.cancer_study_identifier = cs.cancer_study_identifier + INNER JOIN clinical_attribute_meta AS cam + ON (csamp.attr_id = cam.attr_id) AND (cs.cancer_study_id = cam.cancer_study_id) +WHERE cam.datatype = 'NUMBER' + + + +-- sample_columnstore CREATE TABLE IF NOT EXISTS sample_columnstore ( sample_unique_id VARCHAR(45), diff --git a/src/main/resources/db-scripts/clickhouse/views.sql b/src/main/resources/db-scripts/clickhouse/views.sql index 60d6c5f8388..27845d1052b 100644 --- a/src/main/resources/db-scripts/clickhouse/views.sql +++ b/src/main/resources/db-scripts/clickhouse/views.sql @@ -1,57 +1,6 @@ -DROP VIEW IF EXISTS sample_clinical_attribute_numeric_view; -DROP VIEW IF EXISTS sample_clinical_attribute_categorical_view; -DROP VIEW IF EXISTS patient_clinical_attribute_numeric_view; -DROP VIEW IF EXISTS patient_clinical_attribute_categorical_view; DROP VIEW IF EXISTS sample_view; DROP VIEW IF EXISTS sample_list_view; -CREATE VIEW sample_clinical_attribute_numeric_view - AS -SELECT concat(cs.cancer_study_identifier, '_', s.stable_id) as sample_unique_id, - concat(cs.cancer_study_identifier, '_', p.stable_id) as patient_unique_id, - clinical_sample.attr_id as attribute_name, - cast(clinical_sample.attr_value as float) as attribute_value, - cs.cancer_study_identifier as cancer_study_identifier -FROM cancer_study cs - INNER JOIN patient p on cs.cancer_study_id = p.cancer_study_id - INNER JOIN sample s on p.internal_id = s.patient_id - INNER JOIN clinical_sample ON s.internal_id = clinical_sample.internal_id -WHERE match(clinical_sample.attr_value, '^[\d\.]+$'); - -CREATE VIEW sample_clinical_attribute_categorical_view - AS -SELECT concat(cs.cancer_study_identifier, '_', s.stable_id) as sample_unique_id, - concat(cs.cancer_study_identifier, '_', p.stable_id) as patient_unique_id, - cl.attr_id as attribute_name, - cl.attr_value as attribute_value, - cs.cancer_study_identifier as cancer_study_identifier -FROM cancer_study cs - INNER JOIN patient p on cs.cancer_study_id = p.cancer_study_id - INNER JOIN sample s on p.internal_id = s.patient_id - INNER JOIN clinical_sample cl on s.internal_id = cl.internal_id -WHERE NOT match(cl.attr_value, '^[\d\.]+$'); - -CREATE VIEW patient_clinical_attribute_numeric_view - AS -SELECT concat(cs.cancer_study_identifier, '_', p.stable_id) as patient_unique_id, - cp.attr_id as attribute_name, - cast(cp.attr_value as float) as attribute_value, - cs.cancer_study_identifier as cancer_study_identifier -FROM cancer_study cs - INNER JOIN patient p on cs.cancer_study_id = p.cancer_study_id - INNER JOIN clinical_patient cp on p.internal_id = cp.internal_id -WHERE match(cp.attr_value, '^[\d\.]+$'); - -CREATE VIEW patient_clinical_attribute_categorical_view - AS -SELECT concat(cs.cancer_study_identifier, '_', p.stable_id) as patient_unique_id, - cp.attr_id as attribute_name, - cp.attr_value as attribute_value, - cs.cancer_study_identifier as cancer_study_identifier -FROM cancer_study cs - INNER JOIN patient p on cs.cancer_study_id = p.cancer_study_id - INNER JOIN clinical_patient cp on p.internal_id = cp.internal_id -WHERE NOT match(cp.attr_value, '^[\d\.]+$'); CREATE VIEW sample_view AS diff --git a/src/main/resources/org/cbioportal/persistence/mybatisclickhouse/StudyViewFilterMapper.xml b/src/main/resources/org/cbioportal/persistence/mybatisclickhouse/StudyViewFilterMapper.xml index 5ed126acaa6..046f37afcfd 100644 --- a/src/main/resources/org/cbioportal/persistence/mybatisclickhouse/StudyViewFilterMapper.xml +++ b/src/main/resources/org/cbioportal/persistence/mybatisclickhouse/StudyViewFilterMapper.xml @@ -46,7 +46,8 @@ - + + @@ -54,7 +55,8 @@ - + + @@ -68,7 +70,8 @@ - + + @@ -76,7 +79,8 @@ - + + @@ -112,7 +116,8 @@ SELECT ${unique_id} FROM ${table_name} - WHERE attribute_name = '${clinicalDataFilter.attributeId}' + WHERE attribute_name = '${clinicalDataFilter.attributeId}' AND + type='${type}' AND attribute_value = '${dataFilterValue.value}' diff --git a/src/main/resources/org/cbioportal/persistence/mybatisclickhouse/StudyViewMapper.xml b/src/main/resources/org/cbioportal/persistence/mybatisclickhouse/StudyViewMapper.xml index d177d4a5892..8e491a54296 100644 --- a/src/main/resources/org/cbioportal/persistence/mybatisclickhouse/StudyViewMapper.xml +++ b/src/main/resources/org/cbioportal/persistence/mybatisclickhouse/StudyViewMapper.xml @@ -52,7 +52,7 @@ attribute_name as attrId, attribute_value as attrValue, cancer_study_identifier as studyId - FROM sample_clinical_attribute_numeric_view + FROM sample_clinical_attribute_numeric_mv sample_unique_id IN ( @@ -76,7 +76,7 @@ attribute_name as attrId, attribute_value as attrValue, cancer_study_identifier as studyId - FROM patient_clinical_attribute_numeric_view + FROM patient_clinical_attribute_numeric_mv patient_unique_id IN ( @@ -95,13 +95,13 @@ @@ -111,8 +111,7 @@ Count(*) as count FROM ${table_name_prefix}_clinical_attribute_categorical_view - sample_unique_id IN ( - + patient_unique_id IN ( INTERSECT @@ -129,6 +128,71 @@ GROUP BY attribute_name, attribute_value + + + + + + + + SELECT + attribute_name as attributeId, + if(attribute_value='', 'NA', attribute_value) AS value, + count(value) as count + FROM clinical_data_derived + + type='sample' AND + sample_unique_id IN ( + + INTERSECT + SELECT sample_unique_id from sample_mv WHERE sample_mv.patient_unique_id IN + () + + ) + + AND UPPER(value) NOT IN + + #{filteredAttributeValue} + + + AND attribute_name IN + + #{attributeId} + + + GROUP BY attribute_name, + value + + + + SELECT + attribute_name as attributeId, + if(attribute_value='', 'NA', attribute_value) AS value, + count(value) as count + FROM clinical_data_derived + + type='patient' AND + patient_unique_id IN ( + + INTERSECT + + ) + + AND UPPER(value) NOT IN + + #{filteredAttributeValue} + + + AND attribute_name IN + + #{attributeId} + + + GROUP BY attribute_name, + value + + +