From 2c1a9708d94bbf5f68b170888c9686453f04ae83 Mon Sep 17 00:00:00 2001 From: Colleen Xu Date: Tue, 22 Mar 2022 23:12:09 -0700 Subject: [PATCH] adding notebook on drug response kp api data --- ExploringDrugResponseData.ipynb | 517 ++++++++++++++++++++++++++++++++ 1 file changed, 517 insertions(+) create mode 100644 ExploringDrugResponseData.ipynb diff --git a/ExploringDrugResponseData.ipynb b/ExploringDrugResponseData.ipynb new file mode 100644 index 0000000..5729857 --- /dev/null +++ b/ExploringDrugResponseData.ipynb @@ -0,0 +1,517 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Exploring the Drug Response KP API data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2022-03-23T06:07:20.597680Z", + "start_time": "2022-03-23T06:07:20.311775Z" + } + }, + "outputs": [], + "source": [ + "## CX: display the output of every expression in a code cell, not just the last one\n", + "from IPython.core.interactiveshell import InteractiveShell\n", + "InteractiveShell.ast_node_interactivity = \"all\"\n", + "\n", + "## nest_asyncio workaround for nested event-loop errors in Jupyter (currently commented out)\n", + "# import nest_asyncio\n", + "# nest_asyncio.apply()\n", + "\n", + "import pathlib\n", + "import pandas as pd\n", + "import re" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2022-03-23T06:07:44.110657Z", + "start_time": "2022-03-23T06:07:21.604415Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['Subject', 'Subject_Ensembl_gene_ID', 'Subject_NCBI_Gene_ID',\n", + " 'Subject_Approved_symbol', 'Subject_Category', 'Object', 'Object_name',\n", + " 'Object_id', 'Object_Category', 'Predicate',\n", + " 'Edge_attribute_Subject_Modifier', 'Edge_attribute_Object_Modifier',\n", + " 'Edge_attribute_method', 'Edge_attribute_Pvalue',\n", + " 'Edge_attribute_evidence_type', 'Edge_attribute_evidence_value',\n", + " 
'Edge_attribute_sample_size', 'Edge_attribute_sample_orign',\n", + " 'Edge_attribute_MONDO_ID', 'Edge_attribute_DataResource',\n", + " 'Edge_attribute_Publication', 'Edge_attribute_Provider'],\n", + " dtype='object')" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "(5667705, 22)" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "drug_path = pathlib.Path.home().joinpath('Desktop', 'TranslatorSpecificWork', \n", + " 'Table_DrugResponse_KP_v2021.11.21_rm_redundance_v2022.2.25.csv')\n", + "drug_response = pd.read_table(drug_path, sep=\",\")\n", + "drug_response.columns\n", + "drug_response.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2022-03-23T06:08:28.760954Z", + "start_time": "2022-03-23T06:08:20.850820Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(5667705, 22)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "## are there lines where everything is duplicated? 
no: drop_duplicates() keeps all 5667705 rows\n", + "\n", + "drug_response.drop_duplicates().shape" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2022-03-23T06:09:01.990867Z", + "start_time": "2022-03-23T06:08:59.426279Z" + } + }, + "outputs": [], + "source": [ + "## but there are rows that are duplicated across all the columns we focus on (gene IDs, object ID, predicate, MONDO ID)\n", + "duplicates = drug_response[drug_response.duplicated(subset=[\"Subject_Ensembl_gene_ID\",\n", + " \"Subject_NCBI_Gene_ID\",\n", + " \"Object_id\",\n", + " \"Predicate\", \n", + " \"Edge_attribute_MONDO_ID\"], keep=False)].copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2022-03-23T06:09:04.608770Z", + "start_time": "2022-03-23T06:09:04.591452Z" + } + }, + "outputs": [], + "source": [ + "duplicates.sort_values(by=[\"Subject_Approved_symbol\",\n", + " \"Object_name\",\n", + " \"Predicate\", \n", + " \"Edge_attribute_MONDO_ID\"],\n", + " inplace = True\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2022-03-23T06:09:05.429827Z", + "start_time": "2022-03-23T06:09:05.410093Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SubjectSubject_Ensembl_gene_IDSubject_NCBI_Gene_IDSubject_Approved_symbolSubject_CategoryObjectObject_nameObject_idObject_CategoryPredicate...Edge_attribute_methodEdge_attribute_PvalueEdge_attribute_evidence_typeEdge_attribute_evidence_valueEdge_attribute_sample_sizeEdge_attribute_sample_orignEdge_attribute_MONDO_IDEdge_attribute_DataResourceEdge_attribute_PublicationEdge_attribute_Provider
171870A1CFENSEMBL:ENSG00000148584Entrez:29974Symbol:A1CFGeneSorafenibSorafenibPUBCHEM:216239ChemicalSubstancebiolink:associated with resistance to...T-test0.018759effect_size1.30231064SCLCMONDO:0008433GDSCPMID: 27397505Multiomics-BigGIM
5115390A1CFENSEMBL:ENSG00000148584Entrez:29974Symbol:A1CFGeneSorafenibSorafenibPUBCHEM:216239ChemicalSubstancebiolink:associated with resistance to...Spearman_correlation0.001443correlation_coeffienct0.48686740SCLCMONDO:0008433GDSCPMID: 27397505Multiomics-BigGIM
5476A2MENSEMBL:ENSG00000175899Entrez:2Symbol:A2MGeneVX-702VX-702PUBCHEM:10341154ChemicalSubstancebiolink:associated with sensitivity to...T-test0.021311effect_size-1.72205326ALLMONDO:0004967GDSCPMID: 27397505Multiomics-BigGIM
281293A2MENSEMBL:ENSG00000175899Entrez:2Symbol:A2MGeneVX-702VX-702PUBCHEM:10341154ChemicalSubstancebiolink:associated with sensitivity to...Spearman_correlation0.037180correlation_coeffienct-0.43675923ALLMONDO:0004967GDSCPMID: 27397505Multiomics-BigGIM
103094AAGABENSEMBL:ENSG00000103591Entrez:79719Symbol:AAGABGeneAS601245AS601245PUBCHEM:10109823ChemicalSubstancebiolink:associated with resistance to...T-test0.010138effect_size1.85504426ALLMONDO:0004967GDSCPMID: 27397505Multiomics-BigGIM
\n", + "

5 rows × 22 columns

\n", + "
" + ], + "text/plain": [ + " Subject Subject_Ensembl_gene_ID Subject_NCBI_Gene_ID \\\n", + "171870 A1CF ENSEMBL:ENSG00000148584 Entrez:29974 \n", + "5115390 A1CF ENSEMBL:ENSG00000148584 Entrez:29974 \n", + "5476 A2M ENSEMBL:ENSG00000175899 Entrez:2 \n", + "281293 A2M ENSEMBL:ENSG00000175899 Entrez:2 \n", + "103094 AAGAB ENSEMBL:ENSG00000103591 Entrez:79719 \n", + "\n", + " Subject_Approved_symbol Subject_Category Object Object_name \\\n", + "171870 Symbol:A1CF Gene Sorafenib Sorafenib \n", + "5115390 Symbol:A1CF Gene Sorafenib Sorafenib \n", + "5476 Symbol:A2M Gene VX-702 VX-702 \n", + "281293 Symbol:A2M Gene VX-702 VX-702 \n", + "103094 Symbol:AAGAB Gene AS601245 AS601245 \n", + "\n", + " Object_id Object_Category \\\n", + "171870 PUBCHEM:216239 ChemicalSubstance \n", + "5115390 PUBCHEM:216239 ChemicalSubstance \n", + "5476 PUBCHEM:10341154 ChemicalSubstance \n", + "281293 PUBCHEM:10341154 ChemicalSubstance \n", + "103094 PUBCHEM:10109823 ChemicalSubstance \n", + "\n", + " Predicate ... Edge_attribute_method \\\n", + "171870 biolink:associated with resistance to ... T-test \n", + "5115390 biolink:associated with resistance to ... Spearman_correlation \n", + "5476 biolink:associated with sensitivity to ... T-test \n", + "281293 biolink:associated with sensitivity to ... Spearman_correlation \n", + "103094 biolink:associated with resistance to ... 
T-test \n", + "\n", + " Edge_attribute_Pvalue Edge_attribute_evidence_type \\\n", + "171870 0.018759 effect_size \n", + "5115390 0.001443 correlation_coeffienct \n", + "5476 0.021311 effect_size \n", + "281293 0.037180 correlation_coeffienct \n", + "103094 0.010138 effect_size \n", + "\n", + " Edge_attribute_evidence_value Edge_attribute_sample_size \\\n", + "171870 1.302310 64 \n", + "5115390 0.486867 40 \n", + "5476 -1.722053 26 \n", + "281293 -0.436759 23 \n", + "103094 1.855044 26 \n", + "\n", + " Edge_attribute_sample_orign Edge_attribute_MONDO_ID \\\n", + "171870 SCLC MONDO:0008433 \n", + "5115390 SCLC MONDO:0008433 \n", + "5476 ALL MONDO:0004967 \n", + "281293 ALL MONDO:0004967 \n", + "103094 ALL MONDO:0004967 \n", + "\n", + " Edge_attribute_DataResource Edge_attribute_Publication \\\n", + "171870 GDSC PMID: 27397505 \n", + "5115390 GDSC PMID: 27397505 \n", + "5476 GDSC PMID: 27397505 \n", + "281293 GDSC PMID: 27397505 \n", + "103094 GDSC PMID: 27397505 \n", + "\n", + " Edge_attribute_Provider \n", + "171870 Multiomics-BigGIM \n", + "5115390 Multiomics-BigGIM \n", + "5476 Multiomics-BigGIM \n", + "281293 Multiomics-BigGIM \n", + "103094 Multiomics-BigGIM \n", + "\n", + "[5 rows x 22 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "duplicates.head()\n", + "## notice the duplicates have different values for the following columns:\n", + "## - Edge_attribute_method\n", + "## - Edge_attribute_Pvalue\n", + "## - Edge_attribute_evidence_type\n", + "## - Edge_attribute_evidence_value\n", + "## - Edge_attribute_sample_size" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2022-03-23T06:09:37.986230Z", + "start_time": "2022-03-23T06:09:37.982258Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(10768, 22)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"duplicates.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.2" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": { + "height": "752.25px", + "left": "49px", + "top": "110.383px", + "width": "171.8px" + }, + "toc_section_display": true, + "toc_window_display": true + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}