diff --git a/ExploringDrugResponseData.ipynb b/ExploringDrugResponseData.ipynb index 5729857..1479801 100644 --- a/ExploringDrugResponseData.ipynb +++ b/ExploringDrugResponseData.ipynb @@ -16,11 +16,11 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": { "ExecuteTime": { - "end_time": "2022-03-23T06:07:20.597680Z", - "start_time": "2022-03-23T06:07:20.311775Z" + "end_time": "2022-03-23T06:23:06.680479Z", + "start_time": "2022-03-23T06:23:06.320579Z" } }, "outputs": [], @@ -40,11 +40,11 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": { "ExecuteTime": { - "end_time": "2022-03-23T06:07:44.110657Z", - "start_time": "2022-03-23T06:07:21.604415Z" + "end_time": "2022-03-23T06:23:30.352564Z", + "start_time": "2022-03-23T06:23:07.123019Z" } }, "outputs": [ @@ -63,7 +63,7 @@ " dtype='object')" ] }, - "execution_count": 2, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" }, @@ -73,7 +73,7 @@ "(5667705, 22)" ] }, - "execution_count": 2, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -88,11 +88,11 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": { "ExecuteTime": { - "end_time": "2022-03-23T06:08:28.760954Z", - "start_time": "2022-03-23T06:08:20.850820Z" + "end_time": "2022-03-23T06:23:44.392619Z", + "start_time": "2022-03-23T06:23:36.346842Z" } }, "outputs": [ @@ -102,29 +102,30 @@ "(5667705, 22)" ] }, - "execution_count": 3, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "## are there lines where everything is duplicated? no\n", + "## are there lines where every value is duplicated? no\n", "\n", "drug_response.drop_duplicates().shape" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": { "ExecuteTime": { - "end_time": "2022-03-23T06:09:01.990867Z", - "start_time": "2022-03-23T06:08:59.426279Z" + "end_time": "2022-03-23T06:23:48.558269Z", + "start_time": "2022-03-23T06:23:45.982333Z" } }, "outputs": [], "source": [ - "## there are lines where everything we focus on is duplicated\n", + "## there are lines where everything we focus on is duplicated: subject, object, predicate, disease context\n", + "## yes\n", "duplicates = drug_response[drug_response.duplicated(subset=[\"Subject_Ensembl_gene_ID\",\n", " \"Subject_NCBI_Gene_ID\",\n", " \"Object_id\",\n", @@ -134,11 +135,36 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2022-03-23T06:23:54.880837Z", + "start_time": "2022-03-23T06:23:54.876704Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(10768, 22)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "duplicates.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 7, "metadata": { "ExecuteTime": { - "end_time": "2022-03-23T06:09:04.608770Z", - "start_time": "2022-03-23T06:09:04.591452Z" + "end_time": "2022-03-23T06:23:56.694550Z", + "start_time": "2022-03-23T06:23:56.675307Z" } }, "outputs": [], @@ -153,11 +179,11 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "metadata": { "ExecuteTime": { - "end_time": "2022-03-23T06:09:05.429827Z", - "start_time": "2022-03-23T06:09:05.410093Z" + "end_time": "2022-03-23T06:23:57.499037Z", + "start_time": "2022-03-23T06:23:57.479394Z" } }, "outputs": [ @@ -398,7 +424,7 @@ "[5 rows x 22 columns]" ] }, - "execution_count": 6, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -415,27 +441,69 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 12, + "metadata": { + "ExecuteTime": { + "end_time": "2022-03-23T06:26:13.234740Z", + "start_time": "2022-03-23T06:26:13.214329Z" + } + }, + "outputs": [], + "source": [ + "## sets of duplicates: 5384 \n", + "\n", + "sets_of_dups = duplicates.value_counts(subset = [\"Subject_Ensembl_gene_ID\",\"Subject_NCBI_Gene_ID\",\n", + " \"Object_id\",\"Predicate\", \"Edge_attribute_MONDO_ID\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 13, "metadata": { "ExecuteTime": { - "end_time": "2022-03-23T06:09:37.986230Z", - "start_time": "2022-03-23T06:09:37.982258Z" + "end_time": "2022-03-23T06:26:16.342056Z", + "start_time": "2022-03-23T06:26:16.338543Z" } }, "outputs": [ { "data": { "text/plain": [ - "(10768, 22)" + "(5384,)" ] }, - "execution_count": 8, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "duplicates.shape" + "sets_of_dups.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "ExecuteTime": { + "end_time": "2022-03-23T06:26:39.157796Z", + "start_time": "2022-03-23T06:26:39.154206Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "10768" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "5384 *2 ## so these are likely 5384 pairs (2 duplicates for each set)" ] }, {