diff --git a/notebooks/comparison/structure_distances_vs_dfg.ipynb b/notebooks/comparison/structure_distances_vs_dfg.ipynb
index 6451cc9..5e304e5 100644
--- a/notebooks/comparison/structure_distances_vs_dfg.ipynb
+++ b/notebooks/comparison/structure_distances_vs_dfg.ipynb
@@ -4,7 +4,14 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "# Influence of DFG conformation on structure distances"
+ "# Can fingerprint distances discriminate DFG conformations?\n",
+ "\n",
+ "The `kissim` fingerprint encodes the pocket residues' spatial distance to four centers—the pocket centroid, hinge region, DFG region and front pocket—and should therefore discriminate between two structures in different conformations; when we compare two structures in *different* conformations the fingerprint distance should be higher than for two structures in *similar* conformations.\n",
+ "\n",
+ "Let's check if this is true using DFG conformations from KLIFS. We plot the distribution of fingerprint distances grouped by in/in, out/out, and in/out pairs.\n",
+ "\n",
+ "- Use fingerprint distances for structure pairs across all kinases\n",
+ "- Use fingerprint distances for structure pairs within the same kinase"
]
},
{
@@ -25,7 +32,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
- "model_id": "de250f837c5842438c9d00f67fc311f9",
+ "model_id": "bd0eed98315c4609bccf279e27dd085d",
"version_major": 2,
"version_minor": 0
},
@@ -38,12 +45,13 @@
"source": [
"from pathlib import Path\n",
"\n",
- "import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
- "from opencadd.databases.klifs import setup_local\n",
+ "from IPython.display import display, Markdown\n",
+ "from opencadd.databases.klifs import setup_remote\n",
+ "from kissim.comparison import FingerprintDistanceGenerator\n",
"\n",
- "from kissim.comparison import FingerprintDistanceGenerator"
+ "from src.definitions import COVERAGE_CUTOFF"
]
},
{
@@ -65,60 +73,38 @@
"plt.style.use(\"seaborn\")"
]
},
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [],
- "source": [
- "%matplotlib inline"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [],
- "source": [
- "COVERAGE_CUTOFF = 0.8"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Load structural metadata"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [],
- "source": [
- "klifs_session = setup_local(HERE / \"../../data/external/structures/20210114_KLIFS_HUMAN/\")\n",
- "structures = klifs_session.structures.all_structures()"
- ]
- },
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Load fingerprint distances"
+ "## Load fingerprint distances with sufficient coverage\n",
+ "\n",
+ "Choose fingerprint distances that are based on spatial distances only (weighting scheme: 010) and that are based on a sufficient pairwise fingerprint bit coverage (default: `COVERAGE_CUTOFF`)."
]
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 5,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.8"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "fingerprint_distance_file = DATA / \"fingerprint_distances.csv\""
+ "COVERAGE_CUTOFF"
]
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 6,
"metadata": {},
"outputs": [
{
@@ -126,21 +112,11 @@
"output_type": "stream",
"text": [
"Number of kinases: 292\n",
- "Number of structures: 4916\n"
+ "Number of structures: 4916\n",
+ "Number of structure pairs: 12081070\n",
+ "Number of structure pairs: 11733382\n"
]
- }
- ],
- "source": [
- "fingerprint_distance_generator = FingerprintDistanceGenerator.from_csv(fingerprint_distance_file)\n",
- "print(f\"Number of kinases: {len(fingerprint_distance_generator.kinase_ids)}\")\n",
- "print(f\"Number of structures: {len(fingerprint_distance_generator.structure_ids)}\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [
+ },
{
"data": {
"text/html": [
@@ -177,8 +153,8 @@
"
6679 \n",
" AAK1 \n",
" AAK1 \n",
- " 0.051608 \n",
- " 0.992667 \n",
+ " 0.053935 \n",
+ " 1.0 \n",
" \n",
" \n",
" 1 \n",
@@ -186,8 +162,8 @@
" 7156 \n",
" AAK1 \n",
" AAK1 \n",
- " 0.059276 \n",
- " 0.991333 \n",
+ " 0.065967 \n",
+ " 1.0 \n",
" \n",
" \n",
" 2 \n",
@@ -195,8 +171,8 @@
" 1104 \n",
" AAK1 \n",
" ABL1 \n",
- " 0.275259 \n",
- " 0.990667 \n",
+ " 0.146598 \n",
+ " 1.0 \n",
" \n",
" \n",
" 3 \n",
@@ -204,8 +180,8 @@
" 1065 \n",
" AAK1 \n",
" ABL1 \n",
- " 0.284854 \n",
- " 0.990667 \n",
+ " 0.150158 \n",
+ " 1.0 \n",
" \n",
" \n",
" 4 \n",
@@ -213,8 +189,8 @@
" 1090 \n",
" AAK1 \n",
" ABL1 \n",
- " 0.262851 \n",
- " 0.990667 \n",
+ " 0.126940 \n",
+ " 1.0 \n",
" \n",
" \n",
"\n",
@@ -222,20 +198,31 @@
],
"text/plain": [
" structure.1 structure.2 kinase.1 kinase.2 distance bit_coverage\n",
- "0 3835 6679 AAK1 AAK1 0.051608 0.992667\n",
- "1 3835 7156 AAK1 AAK1 0.059276 0.991333\n",
- "2 3835 1104 AAK1 ABL1 0.275259 0.990667\n",
- "3 3835 1065 AAK1 ABL1 0.284854 0.990667\n",
- "4 3835 1090 AAK1 ABL1 0.262851 0.990667"
+ "0 3835 6679 AAK1 AAK1 0.053935 1.0\n",
+ "1 3835 7156 AAK1 AAK1 0.065967 1.0\n",
+ "2 3835 1104 AAK1 ABL1 0.146598 1.0\n",
+ "3 3835 1065 AAK1 ABL1 0.150158 1.0\n",
+ "4 3835 1090 AAK1 ABL1 0.126940 1.0"
]
},
- "execution_count": 10,
+ "execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
+ "# Set path\n",
+ "fingerprint_distance_file = DATA / \"fingerprint_distances_010.csv\"\n",
+ "# Load data\n",
+ "fingerprint_distance_generator = FingerprintDistanceGenerator.from_csv(fingerprint_distance_file)\n",
+ "print(f\"Number of kinases: {len(fingerprint_distance_generator.kinase_ids)}\")\n",
+ "print(f\"Number of structures: {len(fingerprint_distance_generator.structure_ids)}\")\n",
"structure_distances = fingerprint_distance_generator.data\n",
+ "print(f\"Number of structure pairs: {structure_distances.shape[0]}\")\n",
+ "structure_distances = structure_distances[\n",
+ " structure_distances[\"bit_coverage\"] >= COVERAGE_CUTOFF\n",
+ "].reset_index(drop=True)\n",
+ "print(f\"Number of structure pairs: {structure_distances.shape[0]}\")\n",
"structure_distances.head()"
]
},
@@ -243,45 +230,24 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "### Sort fingerprint/kinase pairs (alphabetically)"
- ]
- },
- {
- "cell_type": "raw",
- "metadata": {},
- "source": [
- "def sort_structure_pairs(row):\n",
- " if row[\"structure.1\"] > row[\"structure.2\"]:\n",
- " structure_tmp = row[\"structure.1\"]\n",
- " row[\"structure.1\"] = row[\"structure.2\"]\n",
- " row[\"structure.2\"] = structure_tmp\n",
- " kinase_tmp = row[\"kinase.1\"]\n",
- " row[\"kinase.1\"] = row[\"kinase.2\"]\n",
- " row[\"kinase.2\"] = kinase_tmp\n",
- " return row"
- ]
- },
- {
- "cell_type": "raw",
- "metadata": {},
- "source": [
- "%%time\n",
- "structure_distances = structure_distances[:100000].apply(lambda x: sort_structure_pairs(x), axis=1)\n",
- "structure_distances.head()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Filter distances by coverage cutoff"
+ "## Add DFG conformation\n",
+ "\n",
+ "Add DFG conformation from KLIFS to each structure pair."
]
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 7,
"metadata": {},
"outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CPU times: user 1min 11s, sys: 477 ms, total: 1min 11s\n",
+ "Wall time: 1min 13s\n"
+ ]
+ },
{
"data": {
"text/html": [
@@ -309,7 +275,8 @@
" kinase.2 \n",
" distance \n",
" bit_coverage \n",
- " distance_filtered \n",
+ " dfg.1 \n",
+ " dfg.2 \n",
" \n",
" \n",
" \n",
@@ -319,9 +286,10 @@
" 6679 \n",
" AAK1 \n",
" AAK1 \n",
- " 0.051608 \n",
- " 0.992667 \n",
- " 0.051608 \n",
+ " 0.053935 \n",
+ " 1.0 \n",
+ " in \n",
+ " in \n",
" \n",
" \n",
" 1 \n",
@@ -329,9 +297,10 @@
" 7156 \n",
" AAK1 \n",
" AAK1 \n",
- " 0.059276 \n",
- " 0.991333 \n",
- " 0.059276 \n",
+ " 0.065967 \n",
+ " 1.0 \n",
+ " in \n",
+ " in \n",
" \n",
" \n",
" 2 \n",
@@ -339,9 +308,10 @@
" 1104 \n",
" AAK1 \n",
" ABL1 \n",
- " 0.275259 \n",
- " 0.990667 \n",
- " 0.275259 \n",
+ " 0.146598 \n",
+ " 1.0 \n",
+ " in \n",
+ " out-like \n",
" \n",
" \n",
" 3 \n",
@@ -349,9 +319,10 @@
" 1065 \n",
" AAK1 \n",
" ABL1 \n",
- " 0.284854 \n",
- " 0.990667 \n",
- " 0.284854 \n",
+ " 0.150158 \n",
+ " 1.0 \n",
+ " in \n",
+ " out \n",
" \n",
" \n",
" 4 \n",
@@ -359,126 +330,144 @@
" 1090 \n",
" AAK1 \n",
" ABL1 \n",
- " 0.262851 \n",
- " 0.990667 \n",
- " 0.262851 \n",
+ " 0.126940 \n",
+ " 1.0 \n",
+ " in \n",
+ " in \n",
" \n",
" \n",
"\n",
""
],
"text/plain": [
- " structure.1 structure.2 kinase.1 kinase.2 distance bit_coverage \\\n",
- "0 3835 6679 AAK1 AAK1 0.051608 0.992667 \n",
- "1 3835 7156 AAK1 AAK1 0.059276 0.991333 \n",
- "2 3835 1104 AAK1 ABL1 0.275259 0.990667 \n",
- "3 3835 1065 AAK1 ABL1 0.284854 0.990667 \n",
- "4 3835 1090 AAK1 ABL1 0.262851 0.990667 \n",
+ " structure.1 structure.2 kinase.1 kinase.2 distance bit_coverage dfg.1 \\\n",
+ "0 3835 6679 AAK1 AAK1 0.053935 1.0 in \n",
+ "1 3835 7156 AAK1 AAK1 0.065967 1.0 in \n",
+ "2 3835 1104 AAK1 ABL1 0.146598 1.0 in \n",
+ "3 3835 1065 AAK1 ABL1 0.150158 1.0 in \n",
+ "4 3835 1090 AAK1 ABL1 0.126940 1.0 in \n",
"\n",
- " distance_filtered \n",
- "0 0.051608 \n",
- "1 0.059276 \n",
- "2 0.275259 \n",
- "3 0.284854 \n",
- "4 0.262851 "
+ " dfg.2 \n",
+ "0 in \n",
+ "1 in \n",
+ "2 out-like \n",
+ "3 out \n",
+ "4 in "
]
},
- "execution_count": 11,
+ "execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "structure_distances[\"distance_filtered\"] = np.where(\n",
- " structure_distances[\"bit_coverage\"].values < COVERAGE_CUTOFF,\n",
- " np.nan,\n",
- " structure_distances[\"distance\"].values,\n",
- ")\n",
+ "%%time\n",
+ "klifs_session = setup_remote()\n",
+ "structures = klifs_session.structures.all_structures()\n",
+ "dfg = structures.set_index(\"structure.klifs_id\")[\"structure.dfg\"]\n",
+ "structure_distances[\"dfg.1\"] = structure_distances[\"structure.1\"].apply(lambda x: dfg[x])\n",
+ "structure_distances[\"dfg.2\"] = structure_distances[\"structure.2\"].apply(lambda x: dfg[x])\n",
"structure_distances.head()"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Plot DFG conformation pairs\n",
+ "\n",
+ "Group the structure pairs by DFG conformation pairs—in/in, out/out, in/out—and plot their fingerprint distance distributions."
+ ]
+ },
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 8,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Structure pairs: 12081070\n",
- "Structure pairs not-NaN: 11964546\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
- "print(f\"Structure pairs: {structure_distances.shape[0]}\")\n",
- "structure_distances = structure_distances[structure_distances[\"distance_filtered\"].notna()]\n",
- "print(f\"Structure pairs not-NaN: {structure_distances.shape[0]}\")"
+ "def structure_distances_by_dfg_conformation_pairs(structure_distances):\n",
+ " \"\"\"Distances for all, in/in, out/out, and in/out structure pairs.\"\"\"\n",
+ "\n",
+ " dfg_all = structure_distances[\"distance\"]\n",
+ " dfg_in_in = structure_distances[\n",
+ " (structure_distances[\"dfg.1\"] == \"in\") & (structure_distances[\"dfg.2\"] == \"in\")\n",
+ " ][\"distance\"]\n",
+ " dfg_out_out = structure_distances[\n",
+ " (structure_distances[\"dfg.1\"] == \"out\") & (structure_distances[\"dfg.2\"] == \"out\")\n",
+ " ][\"distance\"]\n",
+ " dfg_in_out = structure_distances[\n",
+ " ((structure_distances[\"dfg.1\"] == \"in\") & (structure_distances[\"dfg.2\"] == \"out\"))\n",
+ " | ((structure_distances[\"dfg.1\"] == \"out\") & (structure_distances[\"dfg.2\"] == \"in\"))\n",
+ " ][\"distance\"]\n",
+ "\n",
+ " structure_distances_dfg = pd.DataFrame(\n",
+ " {\"all\": dfg_all, \"in/in\": dfg_in_in, \"out/out\": dfg_out_out, \"in/out\": dfg_in_out}\n",
+ " )\n",
+ " structure_distances_dfg = pd.DataFrame(structure_distances_dfg)\n",
+ "\n",
+ " return structure_distances_dfg"
]
},
{
- "cell_type": "markdown",
+ "cell_type": "code",
+ "execution_count": 9,
"metadata": {},
+ "outputs": [],
"source": [
- "## DFG conformation"
+ "def plot_structure_distances_by_dfg_conformation_pairs(structure_distances):\n",
+ " \"\"\"Plot distribution of structure distances per DFG conformation pair.\"\"\"\n",
+ "\n",
+ " # Data\n",
+ " structure_distances_dfg = structure_distances_by_dfg_conformation_pairs(structure_distances)\n",
+ " print(\"Number of structure pairs per conformation pair:\")\n",
+ " print(structure_distances_dfg.notna().sum())\n",
+ "\n",
+ " # Boxplot\n",
+ " structure_distances_dfg.plot(\n",
+ " kind=\"box\", title=\"Conformation dependent structure pair distances\"\n",
+ " )\n",
+ " plt.show()\n",
+ "\n",
+ " # Stats\n",
+ " display(structure_distances_dfg.describe())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "### Plot DFG-in vs. DFG-out"
+ "### All structures\n",
+ "\n",
+ "Use fingerprint distances for structure pairs between all kinases."
]
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 10,
"metadata": {},
"outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- ":2: SettingWithCopyWarning: \n",
- "A value is trying to be set on a copy of a slice from a DataFrame.\n",
- "Try using .loc[row_indexer,col_indexer] = value instead\n",
- "\n",
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"
- ]
- },
{
"name": "stdout",
"output_type": "stream",
"text": [
- "CPU times: user 1min 2s, sys: 371 ms, total: 1min 2s\n",
- "Wall time: 1min 2s\n"
+ "Number of structure pairs per conformation pair:\n",
+ "all 11733382\n",
+ "in/in 8674987\n",
+ "out/out 99967\n",
+ "in/out 1865839\n",
+ "dtype: int64\n"
]
},
{
- "name": "stderr",
- "output_type": "stream",
- "text": [
- ":3: SettingWithCopyWarning: \n",
- "A value is trying to be set on a copy of a slice from a DataFrame.\n",
- "Try using .loc[row_indexer,col_indexer] = value instead\n",
- "\n",
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"
- ]
- }
- ],
- "source": [
- "%%time\n",
- "dfg = structures.set_index(\"structure.klifs_id\")[\"structure.dfg\"]\n",
- "structure_distances[\"dfg.1\"] = structure_distances[\"structure.1\"].apply(lambda x: dfg[x])\n",
- "structure_distances[\"dfg.2\"] = structure_distances[\"structure.2\"].apply(lambda x: dfg[x])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
{
"data": {
"text/html": [
@@ -500,158 +489,327 @@
" \n",
" \n",
" \n",
- " structure.1 \n",
- " structure.2 \n",
- " kinase.1 \n",
- " kinase.2 \n",
- " distance \n",
- " bit_coverage \n",
- " distance_filtered \n",
- " dfg.1 \n",
- " dfg.2 \n",
+ " all \n",
+ " in/in \n",
+ " out/out \n",
+ " in/out \n",
" \n",
" \n",
" \n",
" \n",
- " 0 \n",
- " 3835 \n",
- " 6679 \n",
- " AAK1 \n",
- " AAK1 \n",
- " 0.051608 \n",
- " 0.992667 \n",
- " 0.051608 \n",
- " in \n",
- " in \n",
+ " count \n",
+ " 1.173338e+07 \n",
+ " 8.674987e+06 \n",
+ " 99967.000000 \n",
+ " 1.865839e+06 \n",
" \n",
" \n",
- " 1 \n",
- " 3835 \n",
- " 7156 \n",
- " AAK1 \n",
- " AAK1 \n",
- " 0.059276 \n",
- " 0.991333 \n",
- " 0.059276 \n",
- " in \n",
- " in \n",
+ " mean \n",
+ " 1.518736e-01 \n",
+ " 1.489599e-01 \n",
+ " 0.137750 \n",
+ " 1.607469e-01 \n",
" \n",
" \n",
- " 2 \n",
- " 3835 \n",
- " 1104 \n",
- " AAK1 \n",
- " ABL1 \n",
- " 0.275259 \n",
- " 0.990667 \n",
- " 0.275259 \n",
- " in \n",
- " out-like \n",
+ " std \n",
+ " 5.115106e-02 \n",
+ " 5.185786e-02 \n",
+ " 0.055707 \n",
+ " 4.802140e-02 \n",
" \n",
" \n",
- " 3 \n",
- " 3835 \n",
- " 1065 \n",
- " AAK1 \n",
- " ABL1 \n",
- " 0.284854 \n",
- " 0.990667 \n",
- " 0.284854 \n",
- " in \n",
- " out \n",
+ " min \n",
+ " 0.000000e+00 \n",
+ " 0.000000e+00 \n",
+ " 0.000000 \n",
+ " 1.784707e-02 \n",
" \n",
" \n",
- " 4 \n",
- " 3835 \n",
- " 1090 \n",
- " AAK1 \n",
- " ABL1 \n",
- " 0.262851 \n",
- " 0.990667 \n",
- " 0.262851 \n",
- " in \n",
- " in \n",
+ " 25% \n",
+ " 1.166886e-01 \n",
+ " 1.121125e-01 \n",
+ " 0.104420 \n",
+ " 1.280648e-01 \n",
+ " \n",
+ " \n",
+ " 50% \n",
+ " 1.466822e-01 \n",
+ " 1.444314e-01 \n",
+ " 0.125542 \n",
+ " 1.517770e-01 \n",
+ " \n",
+ " \n",
+ " 75% \n",
+ " 1.793193e-01 \n",
+ " 1.780976e-01 \n",
+ " 0.159610 \n",
+ " 1.822587e-01 \n",
+ " \n",
+ " \n",
+ " max \n",
+ " 5.353785e-01 \n",
+ " 5.353785e-01 \n",
+ " 0.422093 \n",
+ " 4.958527e-01 \n",
" \n",
" \n",
"\n",
""
],
"text/plain": [
- " structure.1 structure.2 kinase.1 kinase.2 distance bit_coverage \\\n",
- "0 3835 6679 AAK1 AAK1 0.051608 0.992667 \n",
- "1 3835 7156 AAK1 AAK1 0.059276 0.991333 \n",
- "2 3835 1104 AAK1 ABL1 0.275259 0.990667 \n",
- "3 3835 1065 AAK1 ABL1 0.284854 0.990667 \n",
- "4 3835 1090 AAK1 ABL1 0.262851 0.990667 \n",
- "\n",
- " distance_filtered dfg.1 dfg.2 \n",
- "0 0.051608 in in \n",
- "1 0.059276 in in \n",
- "2 0.275259 in out-like \n",
- "3 0.284854 in out \n",
- "4 0.262851 in in "
+ " all in/in out/out in/out\n",
+ "count 1.173338e+07 8.674987e+06 99967.000000 1.865839e+06\n",
+ "mean 1.518736e-01 1.489599e-01 0.137750 1.607469e-01\n",
+ "std 5.115106e-02 5.185786e-02 0.055707 4.802140e-02\n",
+ "min 0.000000e+00 0.000000e+00 0.000000 1.784707e-02\n",
+ "25% 1.166886e-01 1.121125e-01 0.104420 1.280648e-01\n",
+ "50% 1.466822e-01 1.444314e-01 0.125542 1.517770e-01\n",
+ "75% 1.793193e-01 1.780976e-01 0.159610 1.822587e-01\n",
+ "max 5.353785e-01 5.353785e-01 0.422093 4.958527e-01"
]
},
- "execution_count": 14,
"metadata": {},
- "output_type": "execute_result"
+ "output_type": "display_data"
}
],
"source": [
- "structure_distances.head()"
+ "plot_structure_distances_by_dfg_conformation_pairs(structure_distances)"
]
},
{
- "cell_type": "code",
- "execution_count": 16,
+ "cell_type": "markdown",
"metadata": {},
- "outputs": [],
"source": [
- "structure_distances_dfg = {\n",
- " \"all\": structure_distances[\"distance_filtered\"],\n",
- " \"in/in\": structure_distances[\n",
- " (structure_distances[\"dfg.1\"] == \"in\") & (structure_distances[\"dfg.2\"] == \"in\")\n",
- " ][\"distance_filtered\"],\n",
- " \"out/out\": structure_distances[\n",
- " (structure_distances[\"dfg.1\"] == \"out\") & (structure_distances[\"dfg.2\"] == \"out\")\n",
- " ][\"distance_filtered\"],\n",
- " \"in/out\": structure_distances[\n",
- " ((structure_distances[\"dfg.1\"] == \"in\") & (structure_distances[\"dfg.2\"] == \"out\"))\n",
- " | ((structure_distances[\"dfg.1\"] == \"out\") & (structure_distances[\"dfg.2\"] == \"in\"))\n",
- " ][\"distance_filtered\"],\n",
- "}\n",
- "structure_distances_dfg = pd.DataFrame(structure_distances_dfg)"
+ "\n",
+ "\n",
+ "When including all kinases at the same time, the distribution of fingerprint distances is similar for structure pairs with the same DFG conformations (in/in and out/out) and different DFG conformations (in/out). \n",
+ " \n",
+ "The fingerprint seems not to discriminate DFG-conformations on a kinome-wide level, maybe because the encoded spatial information is not restricted to only DFG conformation features. We may see a discriminative effect when comparing structures for a single kinase.\n",
+ "\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Structures for one kinase\n",
+ "\n",
+ "Use fingerprint distances for structure pairs within the same kinase; use only kinases that have a sufficient number of structures in both the DFG-in and the DFG-out conformation (default: at least 10 structures each). "
]
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def kinases_with_high_dfg_in_out_coverage(structure_distances, dfg_structure_coverage_cutoff=10):\n",
+ " \"\"\"Given a dataset, get kinases with a threshold DFG in/out coverage.\"\"\"\n",
+ "\n",
+ " # Get structure KLIFS IDs in our dataset\n",
+ " structure_klifs_ids = (\n",
+ " pd.concat(\n",
+ " [\n",
+ " structure_distances[\"structure.1\"].drop_duplicates(),\n",
+ " structure_distances[\"structure.2\"].drop_duplicates(),\n",
+ " ]\n",
+ " )\n",
+ " .drop_duplicates()\n",
+ " .to_list()\n",
+ " )\n",
+ " print(f\"Number of structures: {len(structure_klifs_ids)}\")\n",
+ "\n",
+ " # Get structural metadata\n",
+ " klifs_session = setup_remote()\n",
+ " structures = klifs_session.structures.all_structures()\n",
+ " structures = structures[structures[\"structure.klifs_id\"].isin(structure_klifs_ids)]\n",
+ "\n",
+ " # Count number of structures per kinase and conformation\n",
+ " dfg_by_kinase = structures.groupby(\"kinase.klifs_name\").apply(\n",
+ " lambda x: x[\"structure.dfg\"].value_counts()\n",
+ " )\n",
+ " dfg_by_kinase = dfg_by_kinase.reset_index()\n",
+ " dfg_by_kinase.columns = [\"kinase\", \"dfg\", \"n_structures\"]\n",
+ "\n",
+ " # Keep only in/out rows\n",
+ " dfg_by_kinase = dfg_by_kinase[(dfg_by_kinase[\"dfg\"] == \"in\") | (dfg_by_kinase[\"dfg\"] == \"out\")]\n",
+ "\n",
+ " # Keep only rows with at least xxx structures\n",
+ " dfg_by_kinase = dfg_by_kinase[dfg_by_kinase[\"n_structures\"] >= dfg_structure_coverage_cutoff]\n",
+ "\n",
+ " # Keep only kinases with both in/out conformations\n",
+ " n_conformations_by_kinase = dfg_by_kinase.groupby(\"kinase\").size()\n",
+ " dfg_by_kinase = dfg_by_kinase[\n",
+ " dfg_by_kinase[\"kinase\"].isin(\n",
+ " n_conformations_by_kinase[n_conformations_by_kinase == 2].index\n",
+ " )\n",
+ " ]\n",
+ "\n",
+ " return dfg_by_kinase.set_index([\"kinase\", \"dfg\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
"metadata": {},
"outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of structures: 4846\n"
+ ]
+ },
{
"data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " n_structures \n",
+ " \n",
+ " \n",
+ " kinase \n",
+ " dfg \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " BRAF \n",
+ " in \n",
+ " 52 \n",
+ " \n",
+ " \n",
+ " out \n",
+ " 29 \n",
+ " \n",
+ " \n",
+ " EGFR \n",
+ " in \n",
+ " 144 \n",
+ " \n",
+ " \n",
+ " out \n",
+ " 10 \n",
+ " \n",
+ " \n",
+ " EphA2 \n",
+ " in \n",
+ " 36 \n",
+ " \n",
+ " \n",
+ " out \n",
+ " 17 \n",
+ " \n",
+ " \n",
+ " MET \n",
+ " in \n",
+ " 49 \n",
+ " \n",
+ " \n",
+ " out \n",
+ " 16 \n",
+ " \n",
+ " \n",
+ " p38a \n",
+ " in \n",
+ " 127 \n",
+ " \n",
+ " \n",
+ " out \n",
+ " 74 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
"text/plain": [
- "all 11964546\n",
- "in/in 8986558\n",
- "out/out 80675\n",
- "in/out 1706654\n",
- "dtype: int64"
+ " n_structures\n",
+ "kinase dfg \n",
+ "BRAF in 52\n",
+ " out 29\n",
+ "EGFR in 144\n",
+ " out 10\n",
+ "EphA2 in 36\n",
+ " out 17\n",
+ "MET in 49\n",
+ " out 16\n",
+ "p38a in 127\n",
+ " out 74"
]
},
- "execution_count": 17,
+ "execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "structure_distances_dfg.notna().sum()"
+ "dfg_by_kinase = kinases_with_high_dfg_in_out_coverage(\n",
+ " structure_distances, dfg_structure_coverage_cutoff=10\n",
+ ")\n",
+ "dfg_by_kinase"
]
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 13,
"metadata": {},
"outputs": [
+ {
+ "data": {
+ "text/markdown": [
+ "#### BRAF"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of DFG-in structures: 52\n",
+ "Number of DFG-out structures: 29\n",
+ "Percentage of DFG-in: 64.2%\n",
+ "Number of structure pairs per conformation pair:\n",
+ "all 3240\n",
+ "in/in 1326\n",
+ "out/out 406\n",
+ "in/out 1508\n",
+ "dtype: int64\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
{
"data": {
"text/html": [
@@ -682,103 +840,668 @@
" \n",
" \n",
" count \n",
- " 1.196455e+07 \n",
- " 8.986558e+06 \n",
- " 80675.000000 \n",
- " 1.706654e+06 \n",
+ " 3240.000000 \n",
+ " 1326.000000 \n",
+ " 406.000000 \n",
+ " 1508.000000 \n",
" \n",
" \n",
" mean \n",
- " 2.591149e-01 \n",
- " 2.568413e-01 \n",
- " 0.233366 \n",
- " 2.609551e-01 \n",
+ " 0.114400 \n",
+ " 0.102131 \n",
+ " 0.069350 \n",
+ " 0.137317 \n",
" \n",
" \n",
" std \n",
- " 6.416281e-02 \n",
- " 6.517848e-02 \n",
- " 0.065284 \n",
- " 5.454680e-02 \n",
+ " 0.041485 \n",
+ " 0.039575 \n",
+ " 0.031290 \n",
+ " 0.029225 \n",
" \n",
" \n",
" min \n",
- " 0.000000e+00 \n",
- " 0.000000e+00 \n",
- " 0.000000 \n",
- " 1.266475e-02 \n",
+ " 0.006776 \n",
+ " 0.006776 \n",
+ " 0.017509 \n",
+ " 0.062493 \n",
" \n",
" \n",
" 25% \n",
- " 2.246006e-01 \n",
- " 2.223800e-01 \n",
- " 0.199945 \n",
- " 2.305254e-01 \n",
+ " 0.088342 \n",
+ " 0.081921 \n",
+ " 0.040331 \n",
+ " 0.119979 \n",
" \n",
" \n",
" 50% \n",
- " 2.509275e-01 \n",
- " 2.486435e-01 \n",
- " 0.238215 \n",
- " 2.545010e-01 \n",
+ " 0.119931 \n",
+ " 0.105342 \n",
+ " 0.063570 \n",
+ " 0.135387 \n",
" \n",
" \n",
" 75% \n",
- " 2.833567e-01 \n",
- " 2.811818e-01 \n",
- " 0.267869 \n",
- " 2.831307e-01 \n",
+ " 0.145842 \n",
+ " 0.137047 \n",
+ " 0.097643 \n",
+ " 0.163447 \n",
" \n",
" \n",
" max \n",
- " 6.492761e-01 \n",
- " 6.359691e-01 \n",
- " 0.513118 \n",
- " 6.446349e-01 \n",
+ " 0.195835 \n",
+ " 0.171465 \n",
+ " 0.139937 \n",
+ " 0.195835 \n",
" \n",
" \n",
"\n",
""
],
"text/plain": [
- " all in/in out/out in/out\n",
- "count 1.196455e+07 8.986558e+06 80675.000000 1.706654e+06\n",
- "mean 2.591149e-01 2.568413e-01 0.233366 2.609551e-01\n",
- "std 6.416281e-02 6.517848e-02 0.065284 5.454680e-02\n",
- "min 0.000000e+00 0.000000e+00 0.000000 1.266475e-02\n",
- "25% 2.246006e-01 2.223800e-01 0.199945 2.305254e-01\n",
- "50% 2.509275e-01 2.486435e-01 0.238215 2.545010e-01\n",
- "75% 2.833567e-01 2.811818e-01 0.267869 2.831307e-01\n",
- "max 6.492761e-01 6.359691e-01 0.513118 6.446349e-01"
+ " all in/in out/out in/out\n",
+ "count 3240.000000 1326.000000 406.000000 1508.000000\n",
+ "mean 0.114400 0.102131 0.069350 0.137317\n",
+ "std 0.041485 0.039575 0.031290 0.029225\n",
+ "min 0.006776 0.006776 0.017509 0.062493\n",
+ "25% 0.088342 0.081921 0.040331 0.119979\n",
+ "50% 0.119931 0.105342 0.063570 0.135387\n",
+ "75% 0.145842 0.137047 0.097643 0.163447\n",
+ "max 0.195835 0.171465 0.139937 0.195835"
]
},
- "execution_count": 18,
"metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "structure_distances_dfg.describe()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "metadata": {},
- "outputs": [
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/markdown": [
+ "#### EGFR"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of DFG-in structures: 144\n",
+ "Number of DFG-out structures: 10\n",
+ "Percentage of DFG-in: 93.51%\n",
+ "Number of structure pairs per conformation pair:\n",
+ "all 12720\n",
+ "in/in 10296\n",
+ "out/out 45\n",
+ "in/out 1440\n",
+ "dtype: int64\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " all \n",
+ " in/in \n",
+ " out/out \n",
+ " in/out \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " count \n",
+ " 12720.000000 \n",
+ " 10296.000000 \n",
+ " 45.000000 \n",
+ " 1440.000000 \n",
+ " \n",
+ " \n",
+ " mean \n",
+ " 0.112897 \n",
+ " 0.105605 \n",
+ " 0.068989 \n",
+ " 0.142613 \n",
+ " \n",
+ " \n",
+ " std \n",
+ " 0.054063 \n",
+ " 0.054532 \n",
+ " 0.045126 \n",
+ " 0.040115 \n",
+ " \n",
+ " \n",
+ " min \n",
+ " 0.006090 \n",
+ " 0.006090 \n",
+ " 0.006781 \n",
+ " 0.048179 \n",
+ " \n",
+ " \n",
+ " 25% \n",
+ " 0.059780 \n",
+ " 0.054421 \n",
+ " 0.023394 \n",
+ " 0.097384 \n",
+ " \n",
+ " \n",
+ " 50% \n",
+ " 0.117316 \n",
+ " 0.089239 \n",
+ " 0.051204 \n",
+ " 0.155302 \n",
+ " \n",
+ " \n",
+ " 75% \n",
+ " 0.162900 \n",
+ " 0.159669 \n",
+ " 0.115118 \n",
+ " 0.176492 \n",
+ " \n",
+ " \n",
+ " max \n",
+ " 0.253772 \n",
+ " 0.242165 \n",
+ " 0.125830 \n",
+ " 0.253772 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " all in/in out/out in/out\n",
+ "count 12720.000000 10296.000000 45.000000 1440.000000\n",
+ "mean 0.112897 0.105605 0.068989 0.142613\n",
+ "std 0.054063 0.054532 0.045126 0.040115\n",
+ "min 0.006090 0.006090 0.006781 0.048179\n",
+ "25% 0.059780 0.054421 0.023394 0.097384\n",
+ "50% 0.117316 0.089239 0.051204 0.155302\n",
+ "75% 0.162900 0.159669 0.115118 0.176492\n",
+ "max 0.253772 0.242165 0.125830 0.253772"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/markdown": [
+ "#### EphA2"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of DFG-in structures: 36\n",
+ "Number of DFG-out structures: 17\n",
+ "Percentage of DFG-in: 67.92%\n",
+ "Number of structure pairs per conformation pair:\n",
+ "all 1378\n",
+ "in/in 630\n",
+ "out/out 136\n",
+ "in/out 612\n",
+ "dtype: int64\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " all \n",
+ " in/in \n",
+ " out/out \n",
+ " in/out \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " count \n",
+ " 1378.000000 \n",
+ " 630.000000 \n",
+ " 136.000000 \n",
+ " 612.000000 \n",
+ " \n",
+ " \n",
+ " mean \n",
+ " 0.081709 \n",
+ " 0.063346 \n",
+ " 0.042226 \n",
+ " 0.109387 \n",
+ " \n",
+ " \n",
+ " std \n",
+ " 0.039527 \n",
+ " 0.037389 \n",
+ " 0.032650 \n",
+ " 0.019606 \n",
+ " \n",
+ " \n",
+ " min \n",
+ " 0.005165 \n",
+ " 0.005165 \n",
+ " 0.006354 \n",
+ " 0.057455 \n",
+ " \n",
+ " \n",
+ " 25% \n",
+ " 0.046306 \n",
+ " 0.037473 \n",
+ " 0.019107 \n",
+ " 0.097730 \n",
+ " \n",
+ " \n",
+ " 50% \n",
+ " 0.093450 \n",
+ " 0.054697 \n",
+ " 0.026715 \n",
+ " 0.108559 \n",
+ " \n",
+ " \n",
+ " 75% \n",
+ " 0.109744 \n",
+ " 0.093213 \n",
+ " 0.046466 \n",
+ " 0.117293 \n",
+ " \n",
+ " \n",
+ " max \n",
+ " 0.196033 \n",
+ " 0.163264 \n",
+ " 0.109037 \n",
+ " 0.196033 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " all in/in out/out in/out\n",
+ "count 1378.000000 630.000000 136.000000 612.000000\n",
+ "mean 0.081709 0.063346 0.042226 0.109387\n",
+ "std 0.039527 0.037389 0.032650 0.019606\n",
+ "min 0.005165 0.005165 0.006354 0.057455\n",
+ "25% 0.046306 0.037473 0.019107 0.097730\n",
+ "50% 0.093450 0.054697 0.026715 0.108559\n",
+ "75% 0.109744 0.093213 0.046466 0.117293\n",
+ "max 0.196033 0.163264 0.109037 0.196033"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
{
"data": {
- "image/png": "\n",
+ "text/markdown": [
+ "#### MET"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of DFG-in structures: 49\n",
+ "Number of DFG-out structures: 16\n",
+ "Percentage of DFG-in: 75.38%\n",
+ "Number of structure pairs per conformation pair:\n",
+ "all 2211\n",
+ "in/in 1176\n",
+ "out/out 120\n",
+ "in/out 784\n",
+ "dtype: int64\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " all \n",
+ " in/in \n",
+ " out/out \n",
+ " in/out \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " count \n",
+ " 2211.000000 \n",
+ " 1176.000000 \n",
+ " 120.000000 \n",
+ " 784.000000 \n",
+ " \n",
+ " \n",
+ " mean \n",
+ " 0.107308 \n",
+ " 0.086336 \n",
+ " 0.092182 \n",
+ " 0.140695 \n",
+ " \n",
+ " \n",
+ " std \n",
+ " 0.050339 \n",
+ " 0.052761 \n",
+ " 0.051939 \n",
+ " 0.023963 \n",
+ " \n",
+ " \n",
+ " min \n",
+ " 0.006791 \n",
+ " 0.006791 \n",
+ " 0.015207 \n",
+ " 0.059057 \n",
+ " \n",
+ " \n",
+ " 25% \n",
+ " 0.069174 \n",
+ " 0.042446 \n",
+ " 0.059070 \n",
+ " 0.124800 \n",
+ " \n",
+ " \n",
+ " 50% \n",
+ " 0.110559 \n",
+ " 0.073395 \n",
+ " 0.073624 \n",
+ " 0.140042 \n",
+ " \n",
+ " \n",
+ " 75% \n",
+ " 0.147206 \n",
+ " 0.112501 \n",
+ " 0.110035 \n",
+ " 0.156143 \n",
+ " \n",
+ " \n",
+ " max \n",
+ " 0.252115 \n",
+ " 0.229456 \n",
+ " 0.223748 \n",
+ " 0.252115 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " all in/in out/out in/out\n",
+ "count 2211.000000 1176.000000 120.000000 784.000000\n",
+ "mean 0.107308 0.086336 0.092182 0.140695\n",
+ "std 0.050339 0.052761 0.051939 0.023963\n",
+ "min 0.006791 0.006791 0.015207 0.059057\n",
+ "25% 0.069174 0.042446 0.059070 0.124800\n",
+ "50% 0.110559 0.073395 0.073624 0.140042\n",
+ "75% 0.147206 0.112501 0.110035 0.156143\n",
+ "max 0.252115 0.229456 0.223748 0.252115"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/markdown": [
+ "#### p38a"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of DFG-in structures: 127\n",
+ "Number of DFG-out structures: 74\n",
+ "Percentage of DFG-in: 63.18%\n",
+ "Number of structure pairs per conformation pair:\n",
+ "all 28920\n",
+ "in/in 8001\n",
+ "out/out 2701\n",
+ "in/out 9398\n",
+ "dtype: int64\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " all \n",
+ " in/in \n",
+ " out/out \n",
+ " in/out \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " count \n",
+ " 28920.000000 \n",
+ " 8001.000000 \n",
+ " 2701.000000 \n",
+ " 9398.000000 \n",
+ " \n",
+ " \n",
+ " mean \n",
+ " 0.104478 \n",
+ " 0.091817 \n",
+ " 0.086538 \n",
+ " 0.115213 \n",
+ " \n",
+ " \n",
+ " std \n",
+ " 0.036459 \n",
+ " 0.032808 \n",
+ " 0.032475 \n",
+ " 0.032484 \n",
+ " \n",
+ " \n",
+ " min \n",
+ " 0.000000 \n",
+ " 0.011152 \n",
+ " 0.000000 \n",
+ " 0.028474 \n",
+ " \n",
+ " \n",
+ " 25% \n",
+ " 0.076598 \n",
+ " 0.066430 \n",
+ " 0.063984 \n",
+ " 0.089868 \n",
+ " \n",
+ " \n",
+ " 50% \n",
+ " 0.101966 \n",
+ " 0.089829 \n",
+ " 0.081929 \n",
+ " 0.111808 \n",
+ " \n",
+ " \n",
+ " 75% \n",
+ " 0.129760 \n",
+ " 0.114976 \n",
+ " 0.107962 \n",
+ " 0.141183 \n",
+ " \n",
+ " \n",
+ " max \n",
+ " 0.263483 \n",
+ " 0.240757 \n",
+ " 0.200164 \n",
+ " 0.252040 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " all in/in out/out in/out\n",
+ "count 28920.000000 8001.000000 2701.000000 9398.000000\n",
+ "mean 0.104478 0.091817 0.086538 0.115213\n",
+ "std 0.036459 0.032808 0.032475 0.032484\n",
+ "min 0.000000 0.011152 0.000000 0.028474\n",
+ "25% 0.076598 0.066430 0.063984 0.089868\n",
+ "50% 0.101966 0.089829 0.081929 0.111808\n",
+ "75% 0.129760 0.114976 0.107962 0.141183\n",
+ "max 0.263483 0.240757 0.200164 0.252040"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
}
],
"source": [
- "structure_distances_dfg.plot(kind=\"box\", title=\"Conformation dependent structure pair distances\");"
+ "for kinase, dfg in dfg_by_kinase.reset_index().groupby(\"kinase\"):\n",
+ " display(Markdown(f\"#### {kinase}\"))\n",
+ " dfg = dfg.set_index(\"dfg\")\n",
+ " n_dfg_in = dfg.loc[\"in\", \"n_structures\"]\n",
+ " n_dfg_out = dfg.loc[\"out\", \"n_structures\"]\n",
+ " print(f\"Number of DFG-in structures: {n_dfg_in}\")\n",
+ " print(f\"Number of DFG-out structures: {n_dfg_out}\")\n",
+ " dfg_in_percentage = round(n_dfg_in / (n_dfg_in + n_dfg_out) * 100, 2)\n",
+ " print(f\"Percentage of DFG-in: {dfg_in_percentage}%\")\n",
+ " structure_distances_by_kinase = structure_distances[\n",
+ " (structure_distances[\"kinase.1\"] == kinase) & (structure_distances[\"kinase.2\"] == kinase)\n",
+ " ].reset_index(drop=True)\n",
+ " plot_structure_distances_by_dfg_conformation_pairs(structure_distances_by_kinase)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "\n",
+ "Here, we compare only fingerprint distances for pairs of structures that describe the same kinase. We observe two interesting shifts:\n",
+ " \n",
+ "1. The distribution for out/out pairs is overall lower than for in/in pairs, although not for every kinase (for MET, the in/in and out/out medians are nearly identical). Potential explanations: definitions for DFG-out are stricter than for DFG-in; the \"real\" diversity of DFG-out structures could still be unknown due to the lower number of structures for DFG-out than for DFG-in.\n",
+ "2. The distance distribution for structure pairs with different DFG conformations (in/out) is overall higher than for pairs with the same DFG conformation (in/in and out/out), i.e., the fingerprint can discriminate DFG conformations of the same kinase.\n",
+ "\n",
+ "
"
]
}
],
@@ -798,7 +1521,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.6"
+ "version": "3.9.5"
}
},
"nbformat": 4,
diff --git a/src/definitions.py b/src/definitions.py
new file mode 100644
index 0000000..56a4815
--- /dev/null
+++ b/src/definitions.py
@@ -0,0 +1,5 @@
+"""
+Defines globals.
+"""
+
+COVERAGE_CUTOFF = 0.8
\ No newline at end of file
diff --git a/src/evaluation/__init__.py b/src/evaluation/__init__.py
index f0219d9..4b28c48 100644
--- a/src/evaluation/__init__.py
+++ b/src/evaluation/__init__.py
@@ -1 +1 @@
-from .ligand_vs_kinase_evaluator import LigandVsKinaseEvaluator # noqa: F401
+from .ligand_vs_kinase_evaluator import LigandVsKinaseEvaluator # noqa: F401
\ No newline at end of file