Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix clinical data counts (use new view that materializes null clinical data entrees) #10833

Merged
merged 1 commit into from
Jun 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,14 @@
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Repository;

import java.util.Collections;
import java.util.List;
import java.util.Map;

@Repository
public class StudyViewMyBatisRepository implements StudyViewRepository {

private static final List<String> FILTERED_CLINICAL_ATTR_VALUES = List.of("NA", "NAN", "N/A");
private static final List<String> FILTERED_CLINICAL_ATTR_VALUES = Collections.emptyList();
private final StudyViewMapper mapper;

@Autowired
Expand Down Expand Up @@ -59,7 +60,7 @@ public List<ClinicalDataCount> getPatientClinicalDataCounts(StudyViewFilter stud

@Override
public List<String> getClinicalDataAttributeNames(ClinicalAttributeDataSource clinicalAttributeDataSource, ClinicalAttributeDataType dataType) {
String tableName = clinicalAttributeDataSource.getValue().toLowerCase() + "_clinical_attribute_" + dataType.getValue().toLowerCase() + "_view";
String tableName = clinicalAttributeDataSource.getValue().toLowerCase() + "_clinical_attribute_" + dataType.getValue().toLowerCase() + "_mv";
return mapper.getClinicalAttributeNames(tableName);
}

Expand Down
46 changes: 46 additions & 0 deletions src/main/resources/db-scripts/clickhouse/derived_tables.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
-- clinical_data_derived
DROP TABLE IF EXISTS clinical_data_derived
CREATE TABLE IF NOT EXISTS clinical_data_derived
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we want to have this convention. Where we define the tables and the insert statements in one file? or the split file approach?

(
sample_unique_id String,
patient_unique_id String,
attribute_name String,
attribute_value String,
cancer_study_identifier String,
type String
)
ENGINE=MergeTree
ORDER BY sample_unique_id

-- Insert sample attribute data
INSERT INTO TABLE clinical_data_derived
SELECT sm.sample_unique_id AS sample_unique_id,
sm.patient_unique_id AS patient_unique_id,
cam.attr_id AS attribute_name,
csamp.attr_value AS attribute_value,
cs.cancer_study_identifier AS cancer_study_identifier,
'sample' AS type
FROM sling_db_2024_05_23_original.sample_mv AS sm
INNER JOIN sling_db_2024_05_23_original.cancer_study AS cs
ON sm.cancer_study_identifier = cs.cancer_study_identifier
FULL OUTER JOIN sling_db_2024_05_23_original.clinical_attribute_meta AS cam
ON cs.cancer_study_id = cam.cancer_study_id
FULL OUTER JOIN sling_db_2024_05_23_original.clinical_sample AS csamp
ON (sm.internal_id = csamp.internal_id) AND (csamp.attr_id = cam.attr_id)
WHERE cam.patient_attribute = 0

-- INSERT patient attribute data
INSERT INTO TABLE clinical_data_derived
SELECT NULL AS sample_unique_id,
concat(cs.cancer_study_identifier, '_', p.stable_id) AS patient_unique_id,
cam.attr_id AS attribute_name,
clinpat.attr_value AS attribute_value,
cs.cancer_study_identifier AS cancer_study_identifier,
'patient' AS type
FROM sling_db_2024_05_23_original.patient AS p
INNER JOIN sling_db_2024_05_23_original.cancer_study AS cs ON p.cancer_study_id = cs.cancer_study_id
FULL OUTER JOIN sling_db_2024_05_23_original.clinical_attribute_meta AS cam
ON cs.cancer_study_id = cam.cancer_study_id
FULL OUTER JOIN sling_db_2024_05_23_original.clinical_patient AS clinpat
ON (p.internal_id = clinpat.internal_id) AND (clinpat.attr_id = cam.attr_id)
WHERE cam.patient_attribute = 1
100 changes: 83 additions & 17 deletions src/main/resources/db-scripts/clickhouse/materialized_views.sql
Original file line number Diff line number Diff line change
@@ -1,14 +1,6 @@
DROP TABLE IF EXISTS sample_clinical_attribute_numeric;
DROP TABLE IF EXISTS sample_clinical_attribute_categorical;
DROP TABLE IF EXISTS patient_clinical_attribute_numeric;
DROP TABLE IF EXISTS patient_clinical_attribute_categorical;
DROP TABLE IF EXISTS sample_columnstore;
DROP TABLE IF EXISTS sample_list_columnstore;
DROP TABLE IF EXISTS genomic_event;
DROP VIEW IF EXISTS sample_clinical_attribute_numeric_mv;
DROP VIEW IF EXISTS sample_clinical_attribute_categorical_mv;
DROP VIEW IF EXISTS patient_clinical_attribute_numeric_mv;
DROP VIEW IF EXISTS patient_clinical_attribute_categorical_mv;
DROP VIEW IF EXISTS sample_columnstore_mv;
DROP VIEW IF EXISTS sample_list_columnstore_mv;

Expand Down Expand Up @@ -135,17 +127,91 @@ FROM cancer_study cs
INNER JOIN clinical_patient cp on p.internal_id = cp.internal_id
WHERE NOT match(cp.attr_value, '^[\d\.]+$');

--patient_clinical_attribute_categorical_mv
DROP VIEW IF EXISTS patient_clinical_attribute_categorical_mv;
CREATE MATERIALIZED VIEW patient_clinical_attribute_categorical_mv
TO patient_clinical_attribute_categorical AS
SELECT concat(cs.cancer_study_identifier, '_', p.stable_id) as patient_unique_id,
cp.attr_id as attribute_name,
cp.attr_value as attribute_value,
cs.cancer_study_identifier as cancer_study_identifier
FROM cancer_study cs
INNER JOIN patient p on cs.cancer_study_id = p.cancer_study_id
INNER JOIN clinical_patient cp on p.internal_id = cp.internal_id
WHERE NOT match(cp.attr_value, '^[\d\.]+$');
ENGINE = MergeTree()
ORDER BY cancer_study_identifier
SETTINGS allow_nullable_key = 1
POPULATE
AS
SELECT concat(cs.cancer_study_identifier, '_', p.stable_id) AS patient_unique_id,
cp.attr_id AS attribute_name,
cp.attr_value AS attribute_value,
cs.cancer_study_identifier AS cancer_study_identifier
FROM clinical_patient AS cp
INNER JOIN patient AS p ON cp.internal_id = p.internal_id
INNER JOIN cancer_study AS cs ON p.cancer_study_id = cs.cancer_study_id
INNER JOIN clinical_attribute_meta AS cam
ON (cp.attr_id = cam.attr_id) AND (cs.cancer_study_id = cam.cancer_study_id)
WHERE cam.datatype = 'STRING'


--patient_clinical_attribute_numeric_mv
DROP VIEW IF EXISTS patient_clinical_attribute_numeric_mv;
CREATE MATERIALIZED VIEW patient_clinical_attribute_numeric_mv
ENGINE = MergeTree()
ORDER BY cancer_study_identifier
SETTINGS allow_nullable_key = 1
POPULATE
AS
SELECT concat(cs.cancer_study_identifier, '_', p.stable_id) AS patient_unique_id,
cp.attr_id AS attribute_name,
cp.attr_value AS attribute_value,
cs.cancer_study_identifier AS cancer_study_identifier
FROM sling_db_2024_05_23_original.clinical_patient AS cp
INNER JOIN sling_db_2024_05_23_original.patient AS p ON cp.internal_id = p.internal_id
INNER JOIN sling_db_2024_05_23_original.cancer_study AS cs ON p.cancer_study_id = cs.cancer_study_id
INNER JOIN sling_db_2024_05_23_original.clinical_attribute_meta AS cam
ON (cp.attr_id = cam.attr_id) AND (cs.cancer_study_id = cam.cancer_study_id)
WHERE cam.datatype = 'NUMBER'

-- sample_clinical_attribute_categorical_mv
DROP VIEW IF EXISTS sample_clinical_attribute_categorical_mv;
CREATE MATERIALIZED VIEW sample_clinical_attribute_categorical_mv
ENGINE = MergeTree()
ORDER BY cancer_study_identifier
SETTINGS allow_nullable_key = 1
POPULATE
AS
SELECT s.sample_unique_id,
s.patient_unique_id,
csamp.attr_id AS attribute_name,
csamp.attr_value AS attribute_value,
s.cancer_study_identifier AS cancer_study_identifier
FROM sling_db_2024_05_23_original.clinical_sample AS csamp
INNER JOIN sling_db_2024_05_23_original.sample_mv AS s ON csamp.internal_id = s.internal_id
INNER JOIN sling_db_2024_05_23_original.cancer_study AS cs
ON s.cancer_study_identifier = cs.cancer_study_identifier
INNER JOIN sling_db_2024_05_23_original.clinical_attribute_meta AS cam
ON (csamp.attr_id = cam.attr_id) AND (cs.cancer_study_id = cam.cancer_study_id)
WHERE cam.datatype = 'STRING'

-- sample_clinical_attribute_numeric_mv
DROP VIEW IF EXISTS sample_clinical_attribute_numeric_mv;
CREATE MATERIALIZED VIEW sample_clinical_attribute_numeric_mv
ENGINE = MergeTree()
ORDER BY cancer_study_identifier
SETTINGS allow_nullable_key = 1
POPULATE
AS
SELECT s.sample_unique_id,
s.patient_unique_id,
csamp.attr_id AS attribute_name,
csamp.attr_value AS attribute_value,
s.cancer_study_identifier AS cancer_study_identifier
FROM clinical_sample AS csamp
INNER JOIN sample_mv AS s ON csamp.internal_id = s.internal_id
INNER JOIN cancer_study AS cs
ON s.cancer_study_identifier = cs.cancer_study_identifier
INNER JOIN clinical_attribute_meta AS cam
ON (csamp.attr_id = cam.attr_id) AND (cs.cancer_study_id = cam.cancer_study_id)
WHERE cam.datatype = 'NUMBER'




-- sample_columnstore
CREATE TABLE IF NOT EXISTS sample_columnstore
(
sample_unique_id VARCHAR(45),
Expand Down
51 changes: 0 additions & 51 deletions src/main/resources/db-scripts/clickhouse/views.sql
Original file line number Diff line number Diff line change
@@ -1,57 +1,6 @@
DROP VIEW IF EXISTS sample_clinical_attribute_numeric_view;
DROP VIEW IF EXISTS sample_clinical_attribute_categorical_view;
DROP VIEW IF EXISTS patient_clinical_attribute_numeric_view;
DROP VIEW IF EXISTS patient_clinical_attribute_categorical_view;
DROP VIEW IF EXISTS sample_view;
DROP VIEW IF EXISTS sample_list_view;

CREATE VIEW sample_clinical_attribute_numeric_view
AS
SELECT concat(cs.cancer_study_identifier, '_', s.stable_id) as sample_unique_id,
concat(cs.cancer_study_identifier, '_', p.stable_id) as patient_unique_id,
clinical_sample.attr_id as attribute_name,
cast(clinical_sample.attr_value as float) as attribute_value,
cs.cancer_study_identifier as cancer_study_identifier
FROM cancer_study cs
INNER JOIN patient p on cs.cancer_study_id = p.cancer_study_id
INNER JOIN sample s on p.internal_id = s.patient_id
INNER JOIN clinical_sample ON s.internal_id = clinical_sample.internal_id
WHERE match(clinical_sample.attr_value, '^[\d\.]+$');

CREATE VIEW sample_clinical_attribute_categorical_view
AS
SELECT concat(cs.cancer_study_identifier, '_', s.stable_id) as sample_unique_id,
concat(cs.cancer_study_identifier, '_', p.stable_id) as patient_unique_id,
cl.attr_id as attribute_name,
cl.attr_value as attribute_value,
cs.cancer_study_identifier as cancer_study_identifier
FROM cancer_study cs
INNER JOIN patient p on cs.cancer_study_id = p.cancer_study_id
INNER JOIN sample s on p.internal_id = s.patient_id
INNER JOIN clinical_sample cl on s.internal_id = cl.internal_id
WHERE NOT match(cl.attr_value, '^[\d\.]+$');

CREATE VIEW patient_clinical_attribute_numeric_view
AS
SELECT concat(cs.cancer_study_identifier, '_', p.stable_id) as patient_unique_id,
cp.attr_id as attribute_name,
cast(cp.attr_value as float) as attribute_value,
cs.cancer_study_identifier as cancer_study_identifier
FROM cancer_study cs
INNER JOIN patient p on cs.cancer_study_id = p.cancer_study_id
INNER JOIN clinical_patient cp on p.internal_id = cp.internal_id
WHERE match(cp.attr_value, '^[\d\.]+$');

CREATE VIEW patient_clinical_attribute_categorical_view
AS
SELECT concat(cs.cancer_study_identifier, '_', p.stable_id) as patient_unique_id,
cp.attr_id as attribute_name,
cp.attr_value as attribute_value,
cs.cancer_study_identifier as cancer_study_identifier
FROM cancer_study cs
INNER JOIN patient p on cs.cancer_study_id = p.cancer_study_id
INNER JOIN clinical_patient cp on p.internal_id = cp.internal_id
WHERE NOT match(cp.attr_value, '^[\d\.]+$');

CREATE VIEW sample_view
AS
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,15 +46,17 @@
<foreach item="clinicalDataFilter" collection="categorizedClinicalDataCountFilter.getSampleNumericalClinicalDataFilters()" open="INTERSECT" separator="INTERSECT">
<include refid="numericalClinicalDataCountFilter">
<property name="unique_id" value="sample_unique_id"/>
<property name="table_name" value="sample_clinical_attribute_numeric_view"/>
<property name="table_name" value="sample_clinical_attribute_numeric_mv"/>
<property name="type" value="sample"/>
</include>
</foreach>
</if>
<if test="categorizedClinicalDataCountFilter.getSampleCategoricalClinicalDataFilters() != null and !categorizedClinicalDataCountFilter.getSampleCategoricalClinicalDataFilters().isEmpty()">
<foreach item="clinicalDataFilter" collection="categorizedClinicalDataCountFilter.getSampleCategoricalClinicalDataFilters()" open="INTERSECT" separator="INTERSECT">
<include refid="categoricalClinicalDataCountFilter">
<property name="unique_id" value="sample_unique_id"/>
<property name="table_name" value="sample_clinical_attribute_categorical_view"/>
<property name="table_name" value="clinical_data_derived"/>
<property name="type" value="sample"/>
</include>
</foreach>
</if>
Expand All @@ -68,15 +70,17 @@
<foreach item="clinicalDataFilter" collection="categorizedClinicalDataCountFilter.getPatientNumericalClinicalDataFilters()" open="INTERSECT" separator="INTERSECT">
<include refid="numericalClinicalDataCountFilter">
<property name="unique_id" value="patient_unique_id"/>
<property name="table_name" value="patient_clinical_attribute_numeric_view"/>
<property name="table_name" value="patient_clinical_attribute_numeric_mv"/>
<property name="type" value="patient"/>
</include>
</foreach>
</if>
<if test="categorizedClinicalDataCountFilter.getPatientCategoricalClinicalDataFilters() != null and !categorizedClinicalDataCountFilter.getPatientCategoricalClinicalDataFilters().isEmpty()">
<foreach item="clinicalDataFilter" collection="categorizedClinicalDataCountFilter.getPatientCategoricalClinicalDataFilters()" open="INTERSECT" separator="INTERSECT">
<include refid="categoricalClinicalDataCountFilter">
<property name="unique_id" value="patient_unique_id"/>
<property name="table_name" value="patient_clinical_attribute_categorical_view"/>
<property name="table_name" value="clinical_data_derived"/>
<property name="type" value="patient"/>
</include>
</foreach>
</if>
Expand Down Expand Up @@ -112,7 +116,8 @@
<sql id="categoricalClinicalDataCountFilter">
SELECT ${unique_id}
FROM ${table_name}
WHERE attribute_name = '${clinicalDataFilter.attributeId}'
WHERE attribute_name = '${clinicalDataFilter.attributeId}' AND
type='${type}'
<foreach item="dataFilterValue" collection="clinicalDataFilter.values" open=" AND ((" separator=") OR (" close="))">
<trim prefix="" prefixOverrides="AND">
AND attribute_value = '${dataFilterValue.value}'
Expand Down
Loading
Loading