From 3a6feefb64211802290a035b8c54d38fee6aeeb6 Mon Sep 17 00:00:00 2001 From: Colin Smith Date: Fri, 20 Dec 2024 12:41:07 -0500 Subject: [PATCH] test: create test data for term-set similarity score analysis Create a set of test data containing term-set similarity scores for various configurations, enabling unit testing of downstream functions that analyze and interpret these scores. --- tests/conftest.py | 11 +++++++++++ tests/data/benchmark/termset_similarity_scores.tsv | 11 +++++++++++ 2 files changed, 22 insertions(+) create mode 100644 tests/data/benchmark/termset_similarity_scores.tsv diff --git a/tests/conftest.py b/tests/conftest.py index 012eee8..f6f8ba0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,6 +2,7 @@ from json import load import pytest +import pandas as pd from spinneret.utilities import load_workbook @@ -55,3 +56,13 @@ def termset_similarity_score_fields(): "average_test_information_content", "best_test_information_content", ] + + +@pytest.fixture(name="termset_similarity_score_dataframe") +def termset_similarity_score_dataframe(): + """Return a fixture for a dataframe of termset similarity scores returned + by the benchmark_against_standard function""" + scores = pd.read_csv( + "tests/data/benchmark/termset_similarity_scores.tsv", sep="\t", encoding="utf-8" + ) + return scores diff --git a/tests/data/benchmark/termset_similarity_scores.tsv b/tests/data/benchmark/termset_similarity_scores.tsv new file mode 100644 index 0000000..eb2fee7 --- /dev/null +++ b/tests/data/benchmark/termset_similarity_scores.tsv @@ -0,0 +1,11 @@ +standard_dir test_dir standard_file predicate_value element_xpath_value standard_set test_set average_score best_score average_jaccard_similarity best_jaccard_similarity average_phenodigm_score best_phenodigm_score average_standard_information_content best_standard_information_content average_test_information_content best_test_information_content +tests/data/benchmark/standard tests/data/benchmark/test_a knb-lter-ntl.1.59_annotation_workbook_annotated.tsv env_broad_scale /eml:eml/dataset ['ENVO:01000286', 'ENVO:01000548', 'ENVO:01000775', 'ENVO:01001021', 'ENVO:01000774', 'ENVO:01000287', 'ENVO:01000252'] ['ENVO:01000317', 'ENVO:01001209', 'ENVO:01001209'] 4.399317289600849 4.616452786848972 0.2719553079933457 0.28735632183908044 1.1051688054274622 1.1517668569518282 12.185656141890044 12.78135971352466 7.598198606401752 8.321928094887362 +tests/data/benchmark/standard tests/data/benchmark/test_b knb-lter-ntl.1.59_annotation_workbook_annotated.tsv contains measurements of type /eml:eml/dataset/dataTable/attributeList/attribute[31] ['ECSO:00002844'] ['ECSO:00002359', 'ECSO:00001534'] 0.0 0.0 +tests/data/benchmark/standard tests/data/benchmark/test_b knb-lter-ntl.1.59_annotation_workbook_annotated.tsv contains measurements of type /eml:eml/dataset/dataTable/attributeList/attribute[41] ['ECSO:00001727'] ['ECSO:00000329'] 0.0 0.0 +tests/data/benchmark/standard tests/data/benchmark/test_b knb-lter-ntl.1.59_annotation_workbook_annotated.tsv contains measurements of type /eml:eml/dataset/dataTable/attributeList/attribute[5] ['ECSO:00000515'] ['ECSO:00001250'] 0.0 0.0 +tests/data/benchmark/standard tests/data/benchmark/test_b knb-lter-ntl.1.59_annotation_workbook_annotated.tsv env_broad_scale /eml:eml/dataset ['ENVO:01000286', 'ENVO:01000548', 'ENVO:01000775', 'ENVO:01001021', 'ENVO:01000774', 'ENVO:01000287', 'ENVO:01000252'] ['ENVO:01001209'] 4.509617311638698 4.616452786848972 0.2719553079933457 0.28735632183908044 1.1051688054274622 1.1517668569518282 12.185656141890044 12.78135971352466 6.874469117916141 6.874469117916141 +tests/data/benchmark/standard tests/data/benchmark/test_b knb-lter-ntl.1.59_annotation_workbook_annotated.tsv usesMethod /eml:eml/dataset ['ENVTHES:21335', 'ENVTHES:20223', 'ENVTHES:21337', 'ENVTHES:20243', 'ENVTHES:20285', 'ENVTHES:21339', 'ENVTHES:20304', 'https://www.wikidata.org/wiki/Q591867', 'https://www.wikidata.org/wiki/Q5149058'] ['ENVTHES:20803', 'ENVTHES:10375', 'ENVTHES:20104', 'ENVTHES:22297', 'ENVTHES:10328'] 0.0 0.0 +tests/data/benchmark/standard tests/data/benchmark/test_a knb-lter-ntl.2.37_annotation_workbook_annotated.tsv env_broad_scale /eml:eml/dataset ['ENVO:01001021', 'ENVO:01000548', 'ENVO:01000775', 'ENVO:01000774', 'ENVO:01000286', 'ENVO:01000287', 'ENVO:01000252'] ['ENVO:00000035', 'ENVO:01001209', 'ENVO:01001209'] 4.521487919995395 4.616452786848972 0.2689232631619699 0.2840909090909091 1.0989954987335404 1.1452040294162371 12.185656141890044 12.78135971352466 9.035433165359823 11.196397212803504 +tests/data/benchmark/standard tests/data/benchmark/test_b knb-lter-ntl.2.37_annotation_workbook_annotated.tsv contains measurements of type /eml:eml/dataset/dataTable/attributeList/attribute[14] ['ECSO:00001799'] ['ECSO:00001120'] 0.0 0.0 +tests/data/benchmark/standard tests/data/benchmark/test_b knb-lter-ntl.2.37_annotation_workbook_annotated.tsv contains measurements of type /eml:eml/dataset/dataTable/attributeList/attribute[26] ['ECSO:00001720'] ['ECSO:00001534'] 0.0 0.0 +tests/data/benchmark/standard tests/data/benchmark/test_b knb-lter-ntl.2.37_annotation_workbook_annotated.tsv env_broad_scale /eml:eml/dataset ['ENVO:01001021', 'ENVO:01000548', 'ENVO:01000775', 'ENVO:01000774', 'ENVO:01000286', 'ENVO:01000287', 'ENVO:01000252'] ['ENVO:01001209'] 4.509617311638698 4.616452786848972 0.2719553079933457 0.28735632183908044 1.1051688054274622 1.1517668569518282 12.185656141890044 12.78135971352466 6.874469117916141 6.874469117916141