Merge pull request #97 from NeuroDataDesign/development

Sprint 1
neurodata · Dec 14, 2018 · 5c5d9f7 · 5c5d9f7
2 parents ecc7547 + ca7a9ff
commit 5c5d9f7
Show file tree

Hide file tree

Showing 480 changed files with 50,088 additions and 1,829 deletions.
diff --git a/.coveragerc b/.coveragerc
@@ -1,2 +1,8 @@
 [run]
 plugins = Cython.Coverage
+
+[report]
+show_missing = True
+omit =
+    mgcpy/benchmarks/figure_2_power_curve.py
+    mgcpy/benchmarks/power_two_sample.py
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -0,0 +1,13 @@
+# Changelog
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
+
+## [0.1.0] - 2018-12-14
+### Added
+- Port [MGC](https://github.com/neurodata/mgc)/[FastMGC](https://github.com/neurodata/mgc-matlab) into the package, by [@tpsatish95](https://github.com/tpsatish95)
+- Port HHG, Pearson/RV/Cca, Spearman/Kendall into package and add data simulations, by [@sampan501](https://github.com/sampan501)
+- Port [dcorr/mcorr/mantel](https://github.com/neurodata/mgc-matlab), power estimation, and validate implementations, by [@junhaobearxiong](https://github.com/junhaobearxiong)
+- Port [MDMR](https://github.com/FCP-INDI/C-PAC/blob/master/CPAC/cwas/mdmr.pyx) into the package, by [@sundaysundya](https://github.com/sundaysundya)
+- Implement Random Forest Independence Test, by [@rguo123](https://github.com/rguo123) [not in `master` yet]
+- Add 2-sample tests to the package, by [@ananyas713](https://github.com/ananyas713)
diff --git a/README.md b/README.md
@@ -4,11 +4,12 @@
 [![Build Status](https://travis-ci.com/NeuroDataDesign/mgcpy.svg?branch=master)](https://travis-ci.com/NeuroDataDesign/mgcpy)
 [![PyPI](https://img.shields.io/pypi/v/mgcpy.svg)](https://pypi.org/project/mgcpy/)
 [![PyPI - Downloads](https://img.shields.io/pypi/dm/mgcpy.svg)](https://pypi.org/project/mgcpy/)
+[![DockerHub](https://img.shields.io/docker/automated/tpsatish95/mgcpy.svg)](https://hub.docker.com/r/tpsatish95/mgcpy/)
 [![DOI](https://zenodo.org/badge/147731955.svg)](https://zenodo.org/badge/latestdoi/147731955)
 [![Documentation Status](https://readthedocs.org/projects/mgcpy/badge/?version=latest)](https://mgcpy.readthedocs.io/en/latest/?badge=latest)
 [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
 [![PEP8](https://img.shields.io/badge/code%20style-pep8-orange.svg)](https://www.python.org/dev/peps/pep-0008/)
-<a href="https://codeclimate.com/github/NeuroDataDesign/mgcpy/maintainability"><img src="https://api.codeclimate.com/v1/badges/979888a65926b3f27971/maintainability" /></a>
+[![Code Climate](https://api.codeclimate.com/v1/badges/979888a65926b3f27971/maintainability)](https://codeclimate.com/github/NeuroDataDesign/mgcpy/maintainability)
 
 `mgcpy` is a Python package containing tools for multiscale graph correlation and other statistical tests, that is capable of dealing with high dimensional and multivariate data.
 
@@ -34,7 +35,7 @@ python3 setup.py install
 - To build image and run from scratch:
   - Install [docker](https://docs.docker.com/install/)
   - Build the docker image, `docker build -t mgcpy:latest .`
-    - This takes 20-30 mins to build
+    - This takes 10-15 mins to build
   - Launch the container to go into mgcpy's dev env, `docker run -it --rm --name mgcpy-env mgcpy:latest`
 - Pull image from Dockerhub and run:
   - `docker pull tpsatish95/mgcpy:latest` or `docker pull tpsatish95/mgcpy:development`
@@ -56,6 +57,10 @@ python3 setup.py install
 ## MGC Algorithm's Flow
 ![MGCPY Flow](MGCPY.png)
 
+## Power Curves
+- Recreated Figure 2 in https://arxiv.org/abs/1609.05148, with the addition of MDMR and Fast MGC
+![Power Curves](power_curves_dimensions.png)
+
 ## License
 
 This project is covered under the **Apache 2.0 License**.
diff --git a/demos/MDMR.ipynb b/demos/MDMR.ipynb
@@ -6,6 +6,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "# Please comment out the next 4 lines when running in Docker; they are only meant to fix the path on a personal computer.\n",
     "import sys\n",
     "module_path = \"C:\\\\Users\\\\sunda\\\\Desktop\\\\AAA FA18 JHU\\\\NDD1\\\\gitscr\\\\mgcpy\"\n",
     "if module_path not in sys.path:\n",
@@ -25,17 +26,242 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "[25.03876688]\n"
+      "MDMR statistic for data from R MDMR implementation = [75.11630064]\n"
      ]
     }
    ],
    "source": [
+    "# data from R's implementation of MDMR\n",
     "X = np.genfromtxt('../mgcpy/independence_tests/unit_tests/mdmr/data/X_mdmr.csv', delimiter=\",\")\n",
     "Y = np.genfromtxt('../mgcpy/independence_tests/unit_tests/mdmr/data/Y_mdmr.csv', delimiter=\",\")\n",
     "\n",
+    "#statistic for all of data\n",
     "mdmr = MDMR(compute_distance_matrix)\n",
     "a, _ = mdmr.test_statistic(X, Y)\n",
-    "print(a)"
+    "print(\"MDMR statistic for data from R MDMR implementation =\",a)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "MDMR statistic for data from R MDMR implementation = [75.11630064]\n",
+      "MDMR statistic for variable 1 of X = 10.399530443897767\n",
+      "MDMR statistic for variable 2 of X = 4.122633123353008\n",
+      "MDMR statistic for variable 3 of X = 11.31712736920715\n"
+     ]
+    }
+   ],
+   "source": [
+    "#statistics for each variable of X individually\n",
+    "a, indarray = mdmr.test_statistic(X, Y, individual=1)\n",
+    "print(\"MDMR statistic for data from R MDMR implementation =\",a)\n",
+    "print(\"MDMR statistic for variable 1 of X =\", indarray[0,1])\n",
+    "print(\"MDMR statistic for variable 2 of X =\", indarray[1,1])\n",
+    "print(\"MDMR statistic for variable 3 of X =\", indarray[2,1])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "P-value for data from R MDMR implementation = 0.0\n"
+     ]
+    }
+   ],
+   "source": [
+    "#p-value for all of the data\n",
+    "b, _ = mdmr.p_value(X, Y)\n",
+    "print(\"P-value for data from R MDMR implementation =\", b)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "P-value for variable 1 of X = 0.0\n",
+      "P-value for variable 2 of X = 0.0\n",
+      "P-value for variable 3 of X = 0.0\n"
+     ]
+    }
+   ],
+   "source": [
+    "# p-value for each variable of X individually\n",
+    "indarray = mdmr.ind_p_value(X, Y)\n",
+    "print(\"P-value for variable 1 of X =\", indarray[0,2])\n",
+    "print(\"P-value for variable 2 of X =\", indarray[1,2])\n",
+    "print(\"P-value for variable 3 of X =\", indarray[2,2])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from mgcpy.benchmarks import simulations as sims"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "MDMR statistic for linear data = [2449.37187623]\n",
+      "P-value for linear data = 0.0\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Linear data\n",
+    "X, Y = sims.linear_sim(num_samp=60, num_dim=1, noise=0.1)\n",
+    "mdmr = MDMR(compute_distance_matrix)\n",
+    "statistic, _ = mdmr.test_statistic(X,Y)\n",
+    "pval, _ = mdmr.p_value(X,Y)\n",
+    "print(\"MDMR statistic for linear data =\", statistic)\n",
+    "print(\"P-value for linear data =\", pval)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "MDMR statistic for spiral data = [2.96644128]\n",
+      "P-value for spiral data = 0.093\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\sunda\\Downloads\\WPy-3662\\python-3.6.6.amd64\\lib\\site-packages\\mgcpy-0.0.3-py3.6-win-amd64.egg\\mgcpy\\independence_tests\\abstract_class.py:154: UserWarning: The p-value is greater than 0.05, implying that the results are not statistically significant.\n",
+      "Use results such as test_statistic and optimal_scale, with caution!\n",
+      "  \"Use results such as test_statistic and optimal_scale, with caution!\")\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Spiral data\n",
+    "X, Y = sims.spiral_sim(num_samp=60, num_dim=1, noise=0.1)\n",
+    "mdmr = MDMR(compute_distance_matrix)\n",
+    "statistic, _ = mdmr.test_statistic(X,Y)\n",
+    "pval, _ = mdmr.p_value(X,Y)\n",
+    "print(\"MDMR statistic for spiral data =\", statistic)\n",
+    "print(\"P-value for spiral data =\", pval)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "MDMR statistic for logarithmic data = [0.01367779]\n",
+      "P-value for logarithmic data = 0.903\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\sunda\\Downloads\\WPy-3662\\python-3.6.6.amd64\\lib\\site-packages\\mgcpy-0.0.3-py3.6-win-amd64.egg\\mgcpy\\independence_tests\\abstract_class.py:154: UserWarning: The p-value is greater than 0.05, implying that the results are not statistically significant.\n",
+      "Use results such as test_statistic and optimal_scale, with caution!\n",
+      "  \"Use results such as test_statistic and optimal_scale, with caution!\")\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Logarithmic data\n",
+    "X, Y = sims.log_sim(num_samp=60, num_dim=1, noise=0.1)\n",
+    "mdmr = MDMR(compute_distance_matrix)\n",
+    "statistic, _ = mdmr.test_statistic(X,Y)\n",
+    "pval, _ = mdmr.p_value(X,Y)\n",
+    "print(\"MDMR statistic for logarithmic data =\", statistic)\n",
+    "print(\"P-value for logarithmic data =\", pval)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "MDMR statistic for step data = [142.75290107]\n",
+      "P-value for step data = 0.0\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Step data\n",
+    "X, Y = sims.step_sim(num_samp=60, num_dim=1, noise=0.1)\n",
+    "mdmr = MDMR(compute_distance_matrix)\n",
+    "statistic, _ = mdmr.test_statistic(X,Y)\n",
+    "pval, _ = mdmr.p_value(X,Y)\n",
+    "print(\"MDMR statistic for step data =\", statistic)\n",
+    "print(\"P-value for step data =\", pval)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "MDMR statistic for W data = [9.31000713e-05]\n",
+      "P-value for W data = 0.993\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\sunda\\Downloads\\WPy-3662\\python-3.6.6.amd64\\lib\\site-packages\\mgcpy-0.0.3-py3.6-win-amd64.egg\\mgcpy\\independence_tests\\abstract_class.py:154: UserWarning: The p-value is greater than 0.05, implying that the results are not statistically significant.\n",
+      "Use results such as test_statistic and optimal_scale, with caution!\n",
+      "  \"Use results such as test_statistic and optimal_scale, with caution!\")\n"
+     ]
+    }
+   ],
+   "source": [
+    "# W data\n",
+    "X, Y = sims.w_sim(num_samp=60, num_dim=1, noise=0.1)\n",
+    "mdmr = MDMR(compute_distance_matrix)\n",
+    "statistic, _ = mdmr.test_statistic(X,Y)\n",
+    "pval, _ = mdmr.p_value(X,Y)\n",
+    "print(\"MDMR statistic for W data =\", statistic)\n",
+    "print(\"P-value for W data =\", pval)"
    ]
   },
   {

diff --git a/demos/RF-Independence Test Tutorial.ipynb b/demos/RF-Independence Test Tutorial.ipynb