diff --git a/colormaps.cpp b/colormaps.cpp
new file mode 100644
index 0000000..851a5c7
--- /dev/null
+++ b/colormaps.cpp
@@ -0,0 +1,430 @@
+/**********
+ *   Copyright 2014 Samuel Bear Powell
+ *
+ *   Licensed under the Apache License, Version 2.0 (the "License");
+ *   you may not use this file except in compliance with the License.
+ *   You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *   Unless required by applicable law or agreed to in writing, software
+ *   distributed under the License is distributed on an "AS IS" BASIS,
+ *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *   See the License for the specific language governing permissions and
+ *   limitations under the License.
+\**********/
+#include "Colormaps.hpp"
+#include <algorithm>
+#include <fstream>
+#include <locale>
+
+#ifdef MSVC_ISNAN
+int isnan(double x) {
+    return _isnan(x);
+}
+int isnan(float x) {
+    return _isnanf(x);
+}
+#endif
+
+#undef min
+#undef max
+
+template<typename T, size_t N>
+static vector<T> vec(T(&arr)[N]) {
+    return vector<T>(begin(arr), end(arr));
+}
+
+uint32_t Colormap::make_format(int red_byte, int green_byte, int blue_byte, int alpha_byte, uint8_t alpha_value) {
+    return (red_byte & 3) | ((green_byte & 3) << 2) | ((blue_byte & 3) << 4) | ((alpha_byte & 3) << 6) | (uint32_t(alpha_value) << 8);
+}
+
+uint32_t Colormap::format_color(uint32_t format, uint8_t r, uint8_t g, uint8_t b) {
+    uint32_t a = ((format >> 8) & 0xff);
+    return format_color(format,r,g,b,a);
+}
+
+uint32_t Colormap::format_color(uint32_t format, uint8_t r, uint8_t g, uint8_t b, uint8_t a) {
+    uint32_t rshift = (format & 3) * 8;
+    uint32_t gshift = ((format >> 2) & 3) * 8;
+    uint32_t bshift = ((format >> 4) & 3) * 8;
+    uint32_t ashift = ((format >> 6) & 3) * 8;
+    return (a << ashift) | (uint32_t(r) << rshift) | (uint32_t(g) << gshift) | (uint32_t(b) << bshift);
+}
+
+void Colormap::get_rgb(uint32_t color, uint32_t format, uint8_t& r, uint8_t& g, uint8_t& b) {
+    uint8_t a;
+    get_rgba(color,format,r,g,b,a);
+}
+
+void Colormap::get_rgba(uint32_t color, uint32_t format, uint8_t& r, uint8_t& g, uint8_t& b, uint8_t& a) {
+    uint32_t rshift = (format & 3) * 8;
+    uint32_t gshift = ((format >> 2) & 3) * 8;
+    uint32_t bshift = ((format >> 4) & 3) * 8;
+    uint32_t ashift = ((format >> 6) & 3) * 8;
+    r = (color >> rshift)&0xff;
+    g = (color >> gshift)&0xff;
+    b = (color >> bshift)&0xff;
+    a = (color >> ashift)&0xff;
+}
+
+uint32_t Colormap::reformat_color(uint32_t color, uint32_t old_format, uint32_t new_format) {
+    uint8_t r,g,b,a;
+    get_rgba(color, old_format, r, g, b, a);
+    return format_color(new_format, r, g, b, a);
+}
+
+Colormap::Colormap() : map(1, 0), format(0), min_color(0), max_color(0), nan_color(0), min(flimits::quiet_NaN()), max(flimits::quiet_NaN()) {}
+
+Colormap::Colormap(const Colormap& cm) : map(cm.map), format(cm.format), max_color(cm.max_color), min_color(cm.min_color), nan_color(cm.nan_color), min(cm.min), max(cm.max) {}
+
+uint32_t Colormap::_apply(float val, float min, float max, float scale) const {
+    if (isnan(val))
+        return nan_color;
+    else if (val < min)
+        return min_color;
+    else if (val >= max)
+        return max_color;
+    else {
+        int idx((val - min)*scale);
+        return map[idx];
+    }
+}
+
+Colormap& Colormap::operator=(const Colormap& cm) {
+    Colormap tmp(cm);
+    swap(tmp);
+    return *this;
+}
+
+void Colormap::swap(Colormap& cm) {
+    std::swap(map, cm.map);
+    std::swap(format, cm.format);
+    std::swap(min_color, cm.min_color);
+    std::swap(max_color, cm.max_color);
+    std::swap(nan_color, cm.nan_color);
+    std::swap(min, cm.min);
+    std::swap(max, cm.max);
+}
+
+Colormap::Colormap(Colormap&& cm) : map(std::move(cm.map)), format(cm.format), max_color(cm.max_color), min_color(cm.min_color), nan_color(cm.nan_color), min(cm.min), max(cm.max) {}
+
+Colormap& Colormap::operator=(Colormap&& cm) {
+    Colormap tmp(std::move(cm));
+    swap(tmp);
+    return *this;
+}
+
+Colormap::Colormap(const vector<uint32_t>& map, uint32_t format, uint32_t min_color, uint32_t max_color, uint32_t nan_color, float min, float max) :
+map(map), format(format), min_color(min_color), max_color(max_color), nan_color(nan_color), min(min), max(max) {}
+
+Colormap::Colormap(const size_t n, const vector<float>& vals, const vector<float>& gray, uint32_t format) : map(n), format(format), min_color(0), max_color(0), nan_color(0), min(flimits::quiet_NaN()), max(flimits::quiet_NaN()) {
+    auto g = interpolate(n, vals, gray);
+    for (size_t i = 0; i < n; ++i) {
+        uint8_t x = uint8_t(g[i] * 0xff);
+        map[i] = format_color(format, x, x, x);
+    }
+    min_color = map[0];
+    max_color = map[n - 1];
+}
+Colormap::Colormap(const size_t n, const vector<float>& vals, const vector<float>& red, const vector<float>& green, const vector<float>& blue, uint32_t format) : map(n), format(format), min_color(0), max_color(0), nan_color(0), min(flimits::quiet_NaN()), max(flimits::quiet_NaN()) {
+    auto r = interpolate(n, vals, red);
+    auto g = interpolate(n, vals, green);
+    auto b = interpolate(n, vals, blue);
+    for (size_t i = 0; i < n; ++i) {
+        map[i] = format_color(format, uint8_t(r[i] * 0xff), uint8_t(g[i] * 0xff), uint8_t(b[i] * 0xff));
+    }
+    min_color = map[0];
+    max_color = map[n - 1];
+}
+Colormap::Colormap(const size_t n, const vector<pair<float, float>> &red, const vector<pair<float, float>> &green, const vector<pair<float, float>> &blue, uint32_t format) : map(n), format(format), min_color(0), max_color(0), nan_color(0), min(flimits::quiet_NaN()), max(flimits::quiet_NaN()) {
+    auto r = interpolate(n, red);
+    auto g = interpolate(n, green);
+    auto b = interpolate(n, blue);
+    for (size_t i = 0; i < n; ++i) {
+        map[i] = format_color(format, uint8_t(r[i] * 0xff), uint8_t(g[i] * 0xff), uint8_t(b[i] * 0xff));
+    }
+    min_color = map[0];
+    max_color = map[n - 1];
+}
+
+size_t Colormap::size() const {
+    return map.size();
+}
+
+void Colormap::reformat(uint32_t new_format) {
+    if(new_format != format) {
+        Colormap tmp(*this);
+        tmp.format = new_format;
+        tmp.min_color = reformat_color(min_color, format, tmp.format);
+        tmp.max_color = reformat_color(max_color, format, tmp.format);
+        tmp.nan_color = reformat_color(nan_color, format, tmp.format);
+        for (uint32_t& c : tmp.map) {
+            c = reformat_color(c, format, tmp.format);
+        }
+        swap(tmp);
+    }
+}
+
+uint32_t Colormap::apply(float val, float min, float max) const {
+    if (isnan(min)) {
+        min = this->min;
+        if (isnan(min))
+            min = 0.0f;
+    }
+    if (isnan(max)) {
+        max = this->max;
+        if (isnan(max))
+            max = 1.0f;
+    }
+    float scale = float(map.size()) / (max - min);
+    return _apply(val, min, max, scale);
+}
+
+void Colormap::apply(float val, uint8_t &r, uint8_t &g, uint8_t &b, float min, float max) const {
+    get_rgb(apply(val,min,max),format,r,g,b);
+}
+void Colormap::apply(float val, uint8_t &r, uint8_t &g, uint8_t &b, uint8_t &a, float min, float max) const {
+    get_rgba(apply(val,min,max),format,r,g,b,a);
+}
+
+
+const int Colormap::min_idx = -1;
+const int Colormap::max_idx = -2;
+const int Colormap::nan_idx = -3;
+
+uint32_t* Colormap::data() {
+    return map.data();
+}
+
+const uint32_t* Colormap::data() const {
+    return map.data();
+}
+
+const uint32_t& Colormap::get(int idx) const {
+    if(idx == min_idx) return min_color;
+    else if(idx == max_idx) return max_color;
+    else if(idx == nan_idx) return nan_color;
+    else return map[idx];
+}
+void Colormap::get(int idx, uint8_t& r, uint8_t& g, uint8_t& b) const {
+    Colormap::get_rgb(get(idx),format,r,g,b);
+}
+void Colormap::get(int idx, uint8_t& r, uint8_t& g, uint8_t& b, uint8_t& a) const {
+    Colormap::get_rgba(get(idx),format,r,g,b,a);
+}
+
+void Colormap::set(int idx, uint8_t r, uint8_t g, uint8_t b) {
+    uint32_t a = ((format >> 8) & 0xff);
+    set(idx,r,g,b,a);
+}
+
+void Colormap::set(int idx, uint8_t r, uint8_t g, uint8_t b, uint8_t a) {
+    if(idx == min_idx) min_color = format_color(format,r,g,b,a);
+    else if(idx == max_idx) max_color = format_color(format,r,g,b,a);
+    else if(idx == nan_idx) nan_color = format_color(format,r,g,b,a);
+    else map[idx] = format_color(format,r,g,b,a);
+}
+
+Colormaps::Colormaps(const std::string& filename, uint32_t format, size_t n) {
+    parse(filename, format, n);
+    ////grayscale:
+    //float zo[] = { 0.0f, 1.0f };
+    //gray = Colormap(n, vec(zo), vec(zo), format);
+    //add("gray",gray);
+    ////jet:
+    //float jv[] = { 0.000, 0.125, 0.375, 0.625, 0.875, 1.000 };
+    //float jr[] = { 0, 0, 0, 1, 1, 0.5 };
+    //float jg[] = { 0, 0, 1, 1, 0, 0 };
+    //float jb[] = { 0.5, 1, 1, 0, 0, 0 };
+    //jet = Colormap(n, vec(jv), vec(jr), vec(jg), vec(jb), format);
+    //add("jet",jet);
+    ////hsv:
+    //float hv[] = { 0, 1. / 6, 2. / 6, 3. / 6, 4. / 6, 5. / 6, 1 };
+    //float hr[] = { 1, 1, 0, 0, 0, 1, 1 };
+    //float hg[] = { 0, 1, 1, 1, 0, 0, 0 };
+    //float hb[] = { 0, 0, 0, 1, 1, 1, 0 };
+    //hsv = Colormap(n, vec(hv), vec(hr), vec(hg), vec(hb), format);
+    //add("hsv", hsv);
+}
+
+static bool parse_name(std::istream& in, std::string& name, char delim) {
+    name = "";
+    std::istream::sentry s(in);
+    if (s) while (in.good()) {
+        char c = in.get();
+        if (std::isspace(c,in.getloc())) continue; //skip space
+        if (c == delim) return true;
+        else name += c;
+    }
+    return false;
+}
+
+static bool parse_list(std::istream& in, std::vector<float>& vals) {
+    vals.clear();
+    std::istream::sentry s(in);
+    float v;
+    bool open = false;
+    if (s) while (in.good()) {
+        char c = in.get();
+        if (std::isspace(c, in.getloc())) continue;
+        if (!open && c == '{') {
+            open = true;
+            in >> v;
+            vals.push_back(v);
+        }
+        else if (open && c == ',') {
+            //number comes next
+            in >> v;
+            vals.push_back(v);
+        }
+        else if (open && c == '}') {
+            //end list
+            return true;
+        }
+        else {
+            //syntax error
+            return false;
+        }
+    }
+    return false;
+}
+
+static bool parse_pair(std::istream& in, std::pair<float,float>& p) {
+    std::istream::sentry s(in);
+    bool open=false;
+    int done = 0;
+    if (s) while (in.good()) {
+        char c = in.get();
+        if (std::isspace(c, in.getloc())) continue; //ignore whitespace
+        if (!open && c == '(') {
+            open = true;
+            in >> p.first;
+            done = 1;
+        }
+        else if (open && done == 1 && c == ',') {
+            in >> p.second;
+            done = 2;
+        }
+        else if (open && done == 2 && c == ')') {
+            return true;
+        }
+        else return false;
+    }
+    return false;
+}
+
+static bool parse_list(std::istream& in, std::vector<std::pair<float,float>>& vals) {
+    vals.clear();
+    std::istream::sentry s(in);
+    std::pair<float,float> val;
+    bool open = false;
+    if (s) while (in.good()) {
+        char c = in.get();
+        if (std::isspace(c, in.getloc())) continue; //ignore spaces
+        if (!open && c == '{') {
+            open = true;
+            if (!parse_pair(in, val)) return false;
+            vals.push_back(val);
+        }
+        else if (open && c == ',') {
+            if (!parse_pair(in, val)) return false;
+            vals.push_back(val);
+        }
+        else if (open && c == '}') {
+            return true;
+        }
+        else {
+            return false;
+        }
+    }
+    return false;
+}
+
+void Colormaps::parse(const std::string& filename, uint32_t format, size_t n) {
+    ifstream in(filename);
+    while (true) {
+        std::string name, cname;
+        if (!parse_name(in, name, ':')) return;
+        if (!parse_name(in, cname, '=')) return;
+        if (cname == "value") {
+            //we're in value, red, green, blue mode
+            std::vector<float> vals, red, green, blue;
+            if (!parse_list(in, vals)) return;
+
+            if (!parse_name(in, cname, '=')) return;
+            if (cname != "red") return;
+            if (!parse_list(in, red)) return;
+
+            if (!parse_name(in, cname, '=')) return;
+            if (cname != "green") return;
+            if (!parse_list(in, green)) return;
+
+            if (!parse_name(in, cname, '=')) return;
+            if (cname != "blue") return;
+            if (!parse_list(in, blue)) return;
+
+            add(name, Colormap(n, vals, red, green, blue, format));
+        }
+        else if (cname == "red") {
+            //we're in pairs mode
+            std::vector<std::pair<float,float>> red, green, blue;
+            if (!parse_list(in, red)) return;
+
+            if (!parse_name(in, cname, '=')) return;
+            if (cname != "green") return;
+            if (!parse_list(in, green)) return;
+
+            if (!parse_name(in, cname, '=')) return;
+            if (cname != "blue") return;
+            if (!parse_list(in, blue)) return;
+
+            add(name, Colormap(n, red, green, blue, format));
+        }
+        else {
+            return;
+        }
+    }
+}
+
+bool Colormaps::has(const string& name) const {
+    return (maps.find(name) != maps.end());
+}
+
+const Colormap& Colormaps::get(const string& name) const {
+    try {
+        return maps.at(name);
+    } catch(std::out_of_range&) {
+        throw std::runtime_error("Colormap not found: " + name);
+    }
+}
+
+Colormap& Colormaps::get(const string& name) {
+    try {
+        return maps.at(name);
+    } catch(std::out_of_range&) {
+        throw std::runtime_error("Colormap not found: " + name);
+    }
+}
+
+Colormap& Colormaps::operator[](const string& name) {
+    return maps[name];
+}
+
+Colormap& Colormaps::operator[](string&& name) {
+    return maps[move(name)];
+}
+
+Colormap& Colormaps::add(const string& name, const Colormap& cmap) {
+    if (name == "gray") gray = cmap;
+    else if (name == "jet") jet = cmap;
+    else if (name == "hsv") hsv = cmap;
+    return maps[name] = cmap;
+}
+
+Colormap& Colormaps::add(const string& name, Colormap&& cmap) {
+    if (name == "gray") gray = cmap;
+    else if (name == "jet") jet = cmap;
+    else if (name == "hsv") hsv = cmap;
+    return maps[name] = std::move(cmap);
+}
diff --git a/colormaps.hpp b/colormaps.hpp
new file mode 100644
index 0000000..1fb90d9
--- /dev/null
+++ b/colormaps.hpp
@@ -0,0 +1,220 @@
+/**********
+ *   Copyright 2014 Samuel Bear Powell
+ *
+ *   Licensed under the Apache License, Version 2.0 (the "License");
+ *   you may not use this file except in compliance with the License.
+ *   You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *   Unless required by applicable law or agreed to in writing, software
+ *   distributed under the License is distributed on an "AS IS" BASIS,
+ *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *   See the License for the specific language governing permissions and
+ *   limitations under the License.
+\**********/
+#pragma once
+#ifndef COLORMAPS_HPP
+#define COLORMAPS_HPP
+
+#include <vector>
+#include <string>
+#include <functional>
+#include <algorithm>
+#include <utility>
+#include <unordered_map>
+#include <cstdint>
+#include <limits>
+using namespace std;
+
+typedef std::numeric_limits<float> flimits;
+
+#ifdef MSVC_ISNAN
+int isnan(double x);
+int isnan(float x);
+#endif
+//A map that translates floats to uint32s
+
+struct Colormap {
+public:
+    static uint32_t make_format(int red_byte=0, int green_byte=1, int blue_byte=2, int alpha_byte=3, uint8_t alpha_value=0xff);
+    static uint32_t format_color(uint32_t format, uint8_t r, uint8_t g, uint8_t b);
+    static uint32_t format_color(uint32_t format, uint8_t r, uint8_t g, uint8_t b, uint8_t a);
+    static void get_rgb(uint32_t color, uint32_t format, uint8_t& r, uint8_t& g, uint8_t& b);
+    static void get_rgba(uint32_t color, uint32_t format, uint8_t& r, uint8_t& g, uint8_t& b, uint8_t& a);
+    static uint32_t reformat_color(uint32_t color, uint32_t old_format, uint32_t new_format);
+    static const int min_idx, max_idx, nan_idx;
+protected:
+    uint32_t _apply(float val, float min, float max, float scale) const;
+public:
+    vector<uint32_t> map;
+    uint32_t format;
+    uint32_t min_color, max_color, nan_color;
+    float min, max;
+
+
+    //constructors:
+    Colormap();
+    Colormap(const Colormap& cm); //copy construct
+    Colormap& operator=(const Colormap& cm); //copy assign
+    void swap(Colormap& cm);
+    Colormap(Colormap&& cm); //move
+    Colormap& operator=(Colormap&& cm); //move assign
+
+    size_t size() const;
+
+    //copy each member:
+    Colormap(const vector<uint32_t>& map, uint32_t format, uint32_t min_color = 0, uint32_t max_color = 0, uint32_t nan_color = 0, float min = flimits::quiet_NaN(), float max = flimits::quiet_NaN());
+    //interpolate the map:
+    Colormap(const size_t n, const vector<float>& vals, const vector<float>& gray, uint32_t format);
+    Colormap(const size_t n, const vector<float>& vals, const vector<float>& red, const vector<float>& green, const vector<float>& blue, uint32_t format);
+    Colormap(const size_t n, const vector<pair<float, float>> &red, const vector<pair<float, float>> &green, const vector<pair<float, float>> &blue, uint32_t format);
+
+    //apply the map to a float:
+    //if min, max = NAN, then we use the member min, max; if member min, max = NAN, then we use 0,1
+    uint32_t apply(float val, float min = flimits::quiet_NaN(), float max = flimits::quiet_NaN()) const;
+    void apply(float val, uint8_t &r, uint8_t &g, uint8_t &b, float min = flimits::quiet_NaN(), float max = flimits::quiet_NaN()) const;
+    void apply(float val, uint8_t &r, uint8_t &g, uint8_t &b, uint8_t &a, float min = flimits::quiet_NaN(), float max = flimits::quiet_NaN()) const;
+    uint32_t operator()(float val, float min = flimits::quiet_NaN(), float max = flimits::quiet_NaN()) const {
+        return apply(val, min, max);
+    }
+    uint32_t operator[](float val) const {
+        return apply(val);
+    }
+
+    uint32_t* data();
+    const uint32_t* data() const;
+    const uint32_t& get(int idx) const;
+    void get(int idx, uint8_t& r, uint8_t& g, uint8_t& b) const;
+    void get(int idx, uint8_t& r, uint8_t& g, uint8_t& b, uint8_t& a) const;
+
+    void set(int idx, uint8_t r, uint8_t g, uint8_t b);
+    void set(int idx, uint8_t r, uint8_t g, uint8_t b, uint8_t a);
+
+    void reformat(uint32_t new_format);
+
+    //apply to an iterable dataset
+    //if min, max are not specified, then we use member min, max; if member min, max = NAN, then we find the min, max of the dataset
+    template<typename InputIt, typename OutputIt>
+    void apply(InputIt first, InputIt last, OutputIt out_first, float min = flimits::quiet_NaN(), float max = flimits::quiet_NaN()) const {
+        //get min and max set up:
+        bool find_min = false, find_max = false;
+        if (isnan(min)) {
+            min = this->min;
+            if (isnan(min)) {
+                min = flimits::infinity();
+                find_min = true;
+            }
+        }
+        if (isnan(max)) {
+            max = this->max;
+            if (isnan(max)) {
+                max = -flimits::infinity();
+                find_max = true;
+            }
+        }
+        if (find_min || find_max) {
+            InputIt it = first;
+            while (it != last) {
+                float val = *it++;
+                if (find_min && val < min) min = val;
+                if (find_max && val > max) max = val;
+            }
+        }
+        //apply the colormap
+        float scale = float(map.size()) / (max - min);
+        while (first != last) {
+            *out_first++ = _apply(*first++, min, max, scale);
+        }
+    }
+    //apply to iterable collections (using std::begin and std::end)
+    template<typename InputT, typename OutputT>
+    void apply(const InputT& in, OutputT& out, const float& min = 0.0f, const float& max = 1.0f) const {
+        apply(std::begin(in), std::end(in), std::begin(out), min, max);
+    }
+};
+
+struct Colormaps {
+    Colormap gray, hsv, jet;
+    unordered_map<string, Colormap> maps;
+    Colormaps(const std::string& filename, uint32_t format, size_t n = 256);
+
+    void parse(const std::string& filename, uint32_t format, size_t n=256);
+
+    bool has(const string& name) const;
+    const Colormap& get(const string& name) const;
+    Colormap& get(const string& name);
+    Colormap& operator[](const string& name);
+    Colormap& operator[](string&& name);
+
+    Colormap& add(const string& name, const Colormap& cmap);
+    Colormap& add(const string& name, Colormap&& cmap);
+
+    unordered_map<string, Colormap>::iterator begin() {
+        return maps.begin();
+    }
+    unordered_map<string, Colormap>::iterator end() {
+        return maps.end();
+    }
+    unordered_map<string, Colormap>::const_iterator begin() const {
+        return maps.begin();
+    }
+    unordered_map<string, Colormap>::const_iterator end() const {
+        return maps.end();
+    }
+};
+
+template<typename value_t>
+vector<value_t> interpolate(size_t n, const vector<float>& positions, const vector<value_t> values) {
+    vector<value_t> out(n);
+    float step = 1.0f / (n - 1);
+    if (values.size() != positions.size())
+        throw runtime_error("positions and values must have same size!");
+    size_t left_idx = 0, end_idx = positions.size();
+    size_t right_idx = (left_idx + 1 == end_idx) ? left_idx : left_idx + 1;
+
+    float left_pos = positions[left_idx], right_pos = positions[right_idx];
+    value_t left_val = values[left_idx], right_val = values[right_idx];
+
+    for (size_t i = 0; i < n; ++i) {
+        float pos = i*step;
+        //increment the left/right points until they straddle the current position
+        while (right_pos <= pos && left_idx + 1 != end_idx) {
+            ++left_idx;
+            right_idx = (left_idx + 1 == end_idx) ? left_idx : left_idx + 1;
+            left_pos = positions[left_idx], right_pos = positions[right_idx];
+            left_val = values[left_idx], right_val = values[right_idx];
+        }
+        //take right or left values if below or above the position:
+        if (pos >= right_pos) out[i] = right_val;
+        else if (pos <= left_pos) out[i] = left_val;
+        //otherwise linearly interpolate between the values:
+        else out[i] = left_val + (right_val - left_val)*((pos - left_pos) / (right_pos - left_pos));
+    }
+    return out;
+}
+
+template<typename value_t>
+vector<value_t> interpolate(size_t n, const vector<pair<float, value_t>> &pts) {
+    vector<value_t> out(n);
+    float step = 1.0f / (n - 1);
+    auto left = pts.begin(), end = pts.end();
+    auto right = left + 1 == end ? left : left + 1;
+    float left_pos = left->first, right_pos = right->first;
+    value_t left_val = left->second, right_val = right->second;
+    for (size_t i = 0; i < n; ++i) {
+        float pos = i*step;
+        while (right_pos <= pos && left + 1 != end) {
+            ++left;
+            right = left + 1 == end ? left : left + 1;
+            left_pos = left->first, left_val = left->second;
+            right_pos = right->first, right_val = right->second;
+        }
+        if (pos >= right_pos) out[i] = right_val;
+        else if (pos <= left_pos) out[i] = left_val;
+        else out[i] = left_val + (right_val - left_val)*((pos - left_pos) / (right_pos - left_pos));
+    }
+    return out;
+}
+
+#endif // COLORMAPS_HPP
diff --git a/polarization.cpp b/polarization.cpp
new file mode 100644
index 0000000..a2abe45
--- /dev/null
+++ b/polarization.cpp
@@ -0,0 +1,503 @@
+/**********
+ *   Copyright 2014 Samuel Bear Powell
+ *
+ *   Licensed under the Apache License, Version 2.0 (the "License");
+ *   you may not use this file except in compliance with the License.
+ *   You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *   Unless required by applicable law or agreed to in writing, software
+ *   distributed under the License is distributed on an "AS IS" BASIS,
+ *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *   See the License for the specific language governing permissions and
+ *   limitations under the License.
+\**********/
+#include "polarization.hpp"
+#include <omp.h>
+#include <cmath>
+#include <limits>
+
+const uint8_t Polarization::default_pattern = (PIX0(0, 0) | PIX1(1, 1) | PIX2(0, 1) | PIX3(1, 0));
+
+void Polarization::trans(__m128 mat[4]) {
+    _MM_TRANSPOSE4_PS(mat[0], mat[1], mat[2], mat[3]);
+}
+
+void Polarization::inv(const __m128 mat[4], __m128 out[4]) {
+    //adapted from Intel AP-928 "Streaming SIMD Extensions - Inverse of a 4x4 Matrix"
+    __m128 minor0, minor1, minor2, minor3;
+    __m128 row0, row1, row2, row3;
+    __m128 det, tmp1;
+
+    row0 = mat[0];
+    row1 = mat[1];
+    row2 = mat[2];
+    row3 = mat[3];
+    //transpose:
+    _MM_TRANSPOSE4_PS(row0, row1, row2, row3);
+    //Compute cofactors:
+    tmp1 = _mm_mul_ps(row2, row3);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+    minor0 = _mm_mul_ps(row1, tmp1);
+    minor1 = _mm_mul_ps(row0, tmp1);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+    minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
+    minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
+    minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E);
+
+    tmp1 = _mm_mul_ps(row1, row2);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+    minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
+    minor3 = _mm_mul_ps(row0, tmp1);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+    minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
+    minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
+    minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E);
+
+    tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+    row2 = _mm_shuffle_ps(row2, row2, 0x4E);
+    minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
+    minor2 = _mm_mul_ps(row0, tmp1);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+    minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
+    minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
+    minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E);
+
+    tmp1 = _mm_mul_ps(row0, row1);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+    minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
+    minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+    minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
+    minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
+
+    tmp1 = _mm_mul_ps(row0, row3);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+    minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
+    minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+    minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
+    minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
+
+    tmp1 = _mm_mul_ps(row0, row2);
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+    minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
+    minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
+    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+    minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
+    minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);
+    //compute determinant:
+    det = _mm_mul_ps(row0, minor0);
+    det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det);
+    det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det);
+    tmp1 = _mm_rcp_ss(det);
+    det = _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1)));
+    det = _mm_shuffle_ps(det, det, 0x00);
+    //store:
+    out[0] = _mm_mul_ps(det, minor0);
+    out[1] = _mm_mul_ps(det, minor1);
+    out[2] = _mm_mul_ps(det, minor2);
+    out[3] = _mm_mul_ps(det, minor3);
+}
+
+//perform a dot product of a row-vector and a matrix
+//the matrix is in column-major order!!
+__m128 Polarization::dot(const __m128 vec, const __m128 mat[4]) {
+    //expand vec to columns
+    __m128 vec0 = _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(0, 0, 0, 0));
+    __m128 vec1 = _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(1, 1, 1, 1));
+    __m128 vec2 = _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(2, 2, 2, 2));
+    __m128 vec3 = _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(3, 3, 3, 3));
+    //element-wise multiply each column of vec with each column of mat
+    __m128 prod0 = _mm_mul_ps(vec0, mat[0]);
+    __m128 prod1 = _mm_mul_ps(vec1, mat[1]);
+    __m128 prod2 = _mm_mul_ps(vec2, mat[2]);
+    __m128 prod3 = _mm_mul_ps(vec3, mat[3]);
+    //sum down each column
+    return _mm_add_ps(_mm_add_ps(prod0, prod1), _mm_add_ps(prod2, prod3));
+}
+//perform a dot product of a matrix with a column vector
+//the matrix is in row-major order!!
+__m128 Polarization::dot(const __m128 mat[4], const __m128 vec) {
+    //element-wise product of vector with each row
+    __m128 prod0 = _mm_mul_ps(vec, mat[0]);
+    __m128 prod1 = _mm_mul_ps(vec, mat[1]);
+    __m128 prod2 = _mm_mul_ps(vec, mat[2]);
+    __m128 prod3 = _mm_mul_ps(vec, mat[3]);
+    //sum each product using horizontal-add
+    return _mm_hadd_ps(_mm_hadd_ps(prod0, prod1), _mm_hadd_ps(prod2, prod3));
+}
+
+float Polarization::dot(const __m128 vec0, const __m128 vec1) {
+    __m128 prod = _mm_mul_ps(vec0, vec1);
+    return prod.m128_f32[0] + prod.m128_f32[1] + prod.m128_f32[2] + prod.m128_f32[3];
+}
+
+void Polarization::unpack_superpixels(const size_t rows, const size_t cols, const __m128* packed, __m128* out, uint8_t pattern) {
+    size_t rows_2 = rows / 2, cols_2 = cols / 2;
+    int r, c, rx2, cx2;
+    #pragma omp parallel for private(r,c,rx2,cx2)
+    for (r = 0; r < rows_2; ++r) {
+        for (c = 0; c < cols_2; ++c) {
+            rx2 = r << 1; cx2 = c << 1;
+            __m128 p = packed[IDX(r, c, cols_2)];
+            out[IDX(rx2 | PATR(pattern, 0), cx2 | PATC(pattern, 0), cols)] = _mm_setr_ps(p.m128_f32[0], 0, 0, 0);
+            out[IDX(rx2 | PATR(pattern, 1), cx2 | PATC(pattern, 1), cols)] = _mm_setr_ps(0, p.m128_f32[1], 0, 0);
+            out[IDX(rx2 | PATR(pattern, 2), cx2 | PATC(pattern, 2), cols)] = _mm_setr_ps(0, 0, p.m128_f32[2], 0);
+            out[IDX(rx2 | PATR(pattern, 3), cx2 | PATC(pattern, 3), cols)] = _mm_setr_ps(0, 0, 0, p.m128_f32[3]);
+        }
+    }
+}
+
+void Polarization::mask_low_high(const size_t n, const __m128* raw, float low, float high, __m128* out) {
+    __m128 low_vec = _mm_set1_ps(low);
+    __m128 high_vec = _mm_set1_ps(high);
+    __m128 nan_vec = _mm_set1_ps(std::numeric_limits<float>::quiet_NaN());
+    #pragma omp parallel for
+    for (int i = 0; i < n; ++i) {
+        __m128 val = *raw++;
+        //mask = (val < low) | (val > high)
+        __m128 mask = _mm_or_ps(_mm_cmplt_ps(val, low_vec), _mm_cmpgt_ps(val, high_vec));
+        //val = (~mask & val) | (mask & NAN)
+        val = _mm_or_ps(_mm_andnot_ps(mask, val), _mm_and_ps(mask, nan_vec));
+        *out++ = val;
+    }
+}
+void Polarization::mask_low(const size_t n, const __m128* raw, float low, __m128* out) {
+    __m128 low_vec = _mm_set1_ps(low);
+    __m128 nan_vec = _mm_set1_ps(std::numeric_limits<float>::quiet_NaN());
+    #pragma omp parallel for
+    for (int i = 0; i < n; ++i) {
+        __m128 val = *raw++;
+        //mask = (val < low)
+        __m128 mask = _mm_cmplt_ps(val, low_vec);
+        //val = (~mask & val) | (mask & NAN)
+        val = _mm_or_ps(_mm_andnot_ps(mask, val), _mm_and_ps(mask, nan_vec));
+        *out++ = val;
+    }
+}
+void Polarization::mask_high(const size_t n, const __m128* raw, float high, __m128* out) {
+    __m128 high_vec = _mm_set1_ps(high);
+    __m128 nan_vec = _mm_set1_ps(std::numeric_limits<float>::quiet_NaN());
+    #pragma omp parallel for
+    for (int i = 0; i < n; ++i) {
+        __m128 val = *raw++;
+        //mask = (val > high)
+        __m128 mask = _mm_cmpgt_ps(val, high_vec);
+        //val = (~mask & val) | (mask & NAN)
+        val = _mm_or_ps(_mm_andnot_ps(mask, val), _mm_and_ps(mask, nan_vec));
+        *out++ = val;
+    }
+}
+
+void Polarization::calibrate_matrix(const size_t n, const __m128* raw, const __m128* darks, const __m128* gains, __m128* out) {
+    #pragma omp parallel for
+    for (int i = 0; i < n; ++i) {
+        out[i] = dot(&gains[i], _mm_sub_ps(raw[i], darks[i]));
+    }
+}
+
+void Polarization::calibrate_matrix2(const size_t n, const __m128* raw, const __m128* darks, const __m128* gains, __m128* out) {
+    #pragma omp parallel for
+    for (int i = 0; i < n; ++i) {
+        out[i] = dot(_mm_sub_ps(raw[i], darks[i]), &gains[i]);
+    }
+}
+
+void Polarization::filter(const size_t rows, const size_t cols, const __m128* in, __m128* out, const size_t filt_rows, const size_t filt_cols, const __m128* filt, const edge_mode::t mode) {
+    switch (mode) {
+    case edge_mode::ZERO:
+        return filter_zero(rows, cols, in, out, filt_rows, filt_cols, filt);
+    case edge_mode::WRAP:
+        return filter_wrap(rows, cols, in, out, filt_rows, filt_cols, filt);
+    case edge_mode::REFLECT:
+        return filter_reflect(rows, cols, in, out, filt_rows, filt_cols, filt);
+    }
+}
+
+void Polarization::filter(const size_t rows, const size_t cols, const __m128* in, __m128* out, const size_t filt_size, const __m128* filt, const edge_mode::t mode) {
+    switch (mode) {
+    case edge_mode::ZERO:
+        return filter_zero(rows, cols, in, out, filt_size, filt);
+    case edge_mode::WRAP:
+        return filter_wrap(rows, cols, in, out, filt_size, filt);
+    case edge_mode::REFLECT:
+        return filter_reflect(rows, cols, in, out, filt_size, filt);
+    }
+}
+
+void Polarization::filter_zero(const size_t rows, const size_t cols, const __m128* in, __m128* out, const size_t filt_rows, const size_t filt_cols, const __m128* filt)
+{
+    int roff = filt_rows / 2;
+    int coff = filt_cols / 2;
+    int r, c, fr, fc, rx, cx;
+    __m128 sum;
+    #pragma omp parallel for private(r,c,sum,fr,fc,rx,cx)
+    for (r = 0; r < rows; ++r) {
+        for (c = 0; c < cols; ++c) {
+            sum = _mm_set1_ps(0);
+            for (fr = 0; fr < filt_rows; ++fr) {
+                for (fc = 0; fc < filt_cols; ++fc) {
+                    rx = r + fr - roff;
+                    cx = c + fc - coff;
+                    if (rx < 0 || cx < 0 || rx >= rows || cx >= cols)
+                        continue; //sum += 0
+                    else //sum += in[rx,cx]*filt[fr,fc];
+                        sum = _mm_add_ps(sum, _mm_mul_ps(in[IDX(rx,cx,cols)],filt[IDX(fr,fc,filt_cols)]));
+                }
+            }
+            out[IDX(r, c, cols)] = sum;
+        }
+    }
+}
+
+void Polarization::filter_zero(const size_t rows, const size_t cols, const __m128* in, __m128* out, const size_t filt_size, const __m128* filt)
+{
+    int off = filt_size / 2;
+    int r, c, fi, rx, cx;
+    __m128 sum;
+    //apply to rows
+    #pragma omp parallel for private(r,c,sum,fi,rx)
+    for (r = 0; r < rows; ++r) {
+        for (c = 0; c < cols; ++c) {
+            sum = _mm_set1_ps(0);
+            for (fi = 0; fi < filt_size; ++fi) {
+                rx = r + fi - off;
+                if(rx < 0 || rx >= rows)
+                    continue;
+                else
+                    sum = _mm_add_ps(sum, _mm_mul_ps(in[IDX(rx,c,cols)],filt[fi]));
+            }
+            out[IDX(r,c,cols)] = sum;
+        }
+    }
+    //apply to columns:
+    #pragma omp parallel for private(r,c,sum,fi,cx)
+    for (r = 0; r < rows; ++r) {
+        for (c = 0; c < cols; ++c) {
+            sum = out[IDX(r,c,cols)];
+            for (fi = 0; fi < filt_size; ++fi) {
+                cx = c + fi - off;
+                if(cx < 0 || cx >= cols)
+                    continue;
+                else
+                    sum = _mm_add_ps(sum, _mm_mul_ps(in[IDX(r,cx,cols)],filt[fi]));
+            }
+            out[IDX(r,c,cols)] = sum;
+        }
+    }
+}
+
+void Polarization::filter_wrap(const size_t rows, const size_t cols, const __m128* in, __m128* out, const size_t filt_rows, const size_t filt_cols, const __m128* filt)
+{
+    int roff = filt_rows / 2;
+    int coff = filt_cols / 2;
+    int r, c, fr, fc, rx, cx;
+    __m128 sum;
+    #pragma omp parallel for private(r,c,sum,fr,fc,rx,cx)
+    for (r = 0; r < rows; ++r) {
+        for (c = 0; c < cols; ++c) {
+            sum = _mm_set1_ps(0);
+            for (fr = 0; fr < filt_rows; ++fr) {
+                for (fc = 0; fc < filt_cols; ++fc) {
+                    rx = WRAP(r + fr - roff, rows);
+                    cx = WRAP(c + fc - coff, cols);
+                    sum = _mm_add_ps(sum, _mm_mul_ps(in[IDX(rx, cx, cols)], filt[IDX(fr, fc, filt_cols)]));
+                }
+            }
+            out[IDX(r, c, cols)] = sum;
+        }
+    }
+}
+
+void Polarization::filter_wrap(const size_t rows, const size_t cols, const __m128* in, __m128* out, const size_t filt_size, const __m128* filt)
+{
+    int off = filt_size / 2;
+    int r, c, fi, rx, cx;
+    __m128 sum;
+    //apply to rows
+    #pragma omp parallel for private(r,c,sum,fi,rx)
+    for (r = 0; r < rows; ++r) {
+        for (c = 0; c < cols; ++c) {
+            sum = _mm_set1_ps(0);
+            for (fi = 0; fi < filt_size; ++fi) {
+                rx = WRAP(r + fi - off, rows);
+                sum = _mm_add_ps(sum, _mm_mul_ps(in[IDX(rx,c,cols)],filt[fi]));
+            }
+            out[IDX(r,c,cols)] = sum;
+        }
+    }
+    //apply to columns:
+    #pragma omp parallel for private(r,c,sum,fi,cx)
+    for (r = 0; r < rows; ++r) {
+        for (c = 0; c < cols; ++c) {
+            sum = out[IDX(r,c,cols)];
+            for (fi = 0; fi < filt_size; ++fi) {
+                cx = WRAP(c + fi - off, cols);
+                sum = _mm_add_ps(sum, _mm_mul_ps(in[IDX(r,cx,cols)],filt[fi]));
+            }
+            out[IDX(r,c,cols)] = sum;
+        }
+    }
+}
+
+void Polarization::filter_reflect(const size_t rows, const size_t cols, const __m128* in, __m128* out, const size_t filt_rows, const size_t filt_cols, const __m128* filt)
+{
+    int roff = filt_rows / 2;
+    int coff = filt_cols / 2;
+    int r, c, fr, fc, rx, cx;
+    __m128 sum;
+    #pragma omp parallel for private(r,c,sum,fr,fc,rx,cx)
+    for (r = 0; r < rows; ++r) {
+        for (c = 0; c < cols; ++c) {
+            sum = _mm_set1_ps(0);
+            for (fr = 0; fr < filt_rows; ++fr) {
+                for (fc = 0; fc < filt_cols; ++fc) {
+                    rx = REFLECT(r + fr - roff, rows);
+                    cx = REFLECT(c + fc - coff, cols);
+                    sum = _mm_add_ps(sum, _mm_mul_ps(in[IDX(rx, cx, cols)], filt[IDX(fr, fc, filt_cols)]));
+                }
+            }
+            out[IDX(r, c, cols)] = sum;
+        }
+    }
+}
+
+void Polarization::filter_reflect(const size_t rows, const size_t cols, const __m128* in, __m128* out, const size_t filt_size, const __m128* filt)
+{
+    int off = filt_size / 2;
+    int r, c, fi, rx, cx;
+    __m128 sum;
+    //apply to rows
+    #pragma omp parallel for private(r,c,sum,fi,rx)
+    for (r = 0; r < rows; ++r) {
+        for (c = 0; c < cols; ++c) {
+            sum = _mm_set1_ps(0);
+            for (fi = 0; fi < filt_size; ++fi) {
+                rx = REFLECT(r + fi - off, rows);
+                sum = _mm_add_ps(sum, _mm_mul_ps(in[IDX(rx,c,cols)],filt[fi]));
+            }
+            out[IDX(r,c,cols)] = sum;
+        }
+    }
+    //apply to columns:
+    #pragma omp parallel for private(r,c,sum,fi,cx)
+    for (r = 0; r < rows; ++r) {
+        for (c = 0; c < cols; ++c) {
+            sum = out[IDX(r,c,cols)];
+            for (fi = 0; fi < filt_size; ++fi) {
+                cx = REFLECT(c + fi - off, cols);
+                sum = _mm_add_ps(sum, _mm_mul_ps(in[IDX(r,cx,cols)],filt[fi]));
+            }
+            out[IDX(r,c,cols)] = sum;
+        }
+    }
+}
+
+//does dot(R[i], img[i]) where R[i] is row-major
+void Polarization::stokes(const size_t n, const __m128* img, const __m128* R, __m128* out) {
+    #pragma omp parallel for
+    for (int i = 0; i < n; ++i) {
+        out[i] = dot(&R[i], img[i]);
+    }
+}
+//does dot(R, img[i]) where R is row-major
+void Polarization::stokesR(const size_t n, const __m128* img, const __m128 R[4], __m128* out) {
+    #pragma omp parallel for
+    for (int i = 0; i < n; ++i) {
+        out[i] = dot(R, img[i]);
+    }
+}
+//does dot(img[i], R[i]) where R[i] is column-major
+void Polarization::stokes2(const size_t n, const __m128* img, const __m128* R, __m128* out) {
+    #pragma omp parallel for
+    for (int i = 0; i < n; ++i) {
+        out[i] = dot(img[i], &R[i]);
+    }
+}
+//does dot(img[i], R) where R is column-major
+void Polarization::stokes2R(const size_t n, const __m128* img, const __m128 R[4], __m128* out) {
+    #pragma omp parallel for
+    for (int i = 0; i < n; ++i) {
+        out[i] = dot(img[i], R);
+    }
+}
+//assumes 0,90,45,135 pixel pattern:
+void Polarization::stokes(const size_t n, const __m128* img, __m128* out) {
+    __m128 R[] = {
+        _mm_setr_ps(0.5,  1.0,  0.0, 0.0),
+        _mm_setr_ps(0.5, -1.0,  0.0, 0.0),
+        _mm_setr_ps(0.5,  0.0,  1.0, 0.0),
+        _mm_setr_ps(0.5,  0.0, -1.0, 0.0)
+    };
+    stokes2R(n, img, R, out);
+}
+
+void Polarization::element(const size_t param, const size_t n, const __m128* simg, float* out) {
+    #pragma omp parallel for
+    for (int i = 0; i < n; ++i) {
+        out[i] = simg[i].m128_f32[param];
+    }
+}
+
+//degree of polarization: hypot(s1,s2,s3)/s0
+void Polarization::dop(const size_t n, const __m128* simg, float* out) {
+    __m128 tmp;
+    float sum;
+    #pragma omp parallel for
+    for (int i = 0; i < n; ++i) {
+        //square each element
+        tmp = _mm_mul_ps(simg[i], simg[i]);
+        //(s1^2+s2^2+s3^2)/s0^2
+        tmp.m128_f32[0] = (tmp.m128_f32[1] + tmp.m128_f32[2] + tmp.m128_f32[3])/tmp.m128_f32[0];
+        out[i] = _mm_sqrt_ss(tmp).m128_f32[0];
+    }
+}
+
+//degree of linear polarization: hypot(s1,s2)/s0
+void Polarization::dolp(const size_t n, const __m128* simg, float* out) {
+    __m128 tmp;
+    float sum;
+    #pragma omp parallel for private(tmp, sum)
+    for (int i = 0; i < n; ++i) {
+        //square each element
+        tmp = _mm_mul_ps(simg[i], simg[i]);
+        //(s1^2 + s2^2)/s0^2
+        tmp.m128_f32[0] = (tmp.m128_f32[1] + tmp.m128_f32[2]) / tmp.m128_f32[0];
+        out[i] = _mm_sqrt_ss(tmp).m128_f32[0];
+    }
+}
+
+//degree of circular polarization: abs(s3)/s0
+void Polarization::docp(const size_t n, const __m128* simg, float* out){
+    #pragma omp parallel for
+    for (int i = 0; i < n; ++i) {
+        out[i] = fabs(simg[i].m128_f32[3] / simg[i].m128_f32[0]);
+    }
+}
+
+//angle of polarization: 0.5*atan(s2/s1)
+void Polarization::aop(const size_t n, const __m128* simg, float* out) {
+    #pragma omp parallel for
+    for (int i = 0; i < n; ++i) {
+        out[i] = 0.5*atan2f(simg[i].m128_f32[2], simg[i].m128_f32[1]);
+    }
+}
+
+//2x angle of polarization: atan(s2/s1)
+void Polarization::aopx2(const size_t n, const __m128* simg, float* out) {
+    #pragma omp parallel for
+    for (int i = 0; i < n; ++i) {
+        out[i] = atan2f(simg[i].m128_f32[2], simg[i].m128_f32[1]);
+    }
+}
+
+//ellipticity angle: 0.5*asin(s3/s0)
+void Polarization::ella(const size_t n, const __m128* simg, float* out) {
+    #pragma omp parallel for
+    for (int i = 0; i < n; ++i) {
+        out[i] = 0.5*asinf(simg[i].m128_f32[3] / simg[i].m128_f32[0]);
+    }
+}
diff --git a/polarization.hpp b/polarization.hpp
new file mode 100644
index 0000000..028f0f5
--- /dev/null
+++ b/polarization.hpp
@@ -0,0 +1,207 @@
+/**********
+ *   Copyright 2014 Samuel Bear Powell
+ *
+ *   Licensed under the Apache License, Version 2.0 (the "License");
+ *   you may not use this file except in compliance with the License.
+ *   You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *   Unless required by applicable law or agreed to in writing, software
+ *   distributed under the License is distributed on an "AS IS" BASIS,
+ *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *   See the License for the specific language governing permissions and
+ *   limitations under the License.
+\**********/
+#ifndef POLARIZATION_HPP
+#define POLARIZATION_HPP
+
+#include <xmmintrin.h> //SSE1,2
+#include <smmintrin.h> //SSE4
+#include <cstdint>
+
+//pattern encoding for 2x2 super-pixels
+//PIXn(r,c) specifies that pixel n (0,1,2,3) belongs in (row, column) = (r,c) of the super-pixel. r, c must be either 0 or 1
+#define PIX0(r,c) (((r)&1) | (((c)&1) << 1))
+#define PIXN(r,c,n) (PIX0(r,c) << (2*(n)))
+#define PIX1(r,c) PIXN(r,c,1)
+#define PIX2(r,c) PIXN(r,c,2)
+#define PIX3(r,c) PIXN(r,c,3)
+
+//pattern decoding for 2x2 super-pixels
+//PATR(pattern,n) yields 0 or 1 for which row pixel n belongs to as specified by the pattern
+//PATC(pattern,n) yields the column.
+#define PATR(p,n) (((p) >> (2*(n))) & 1)
+#define PATC(p,n) (((p) >> (2*(n)+1)) & 1)
+
+//compute an index for a row-major storage matrix
+#define IDX(r,c,cols) ((r)*(cols)+(c))
+//wrap an index 'x' from 0 to max
+#define WRAP(x,max) ((x) < 0 ? ((max) - ((max) % -(x))) : ((max) % (x)))
+//reflect an index x between 0 to max
+#define REFLECT(x,max) (WRAP((x),2*(max)) > (max) ? 2*(max)-WRAP((x),2*(max)) : WRAP((x),2*(max)))
+
+
+//#define DEFAULT_PATTERN (PIX0(0, 0) | PIX1(1, 1) | PIX2(0, 1) | PIX3(1, 0))
+
+namespace Polarization {
+
+    extern const uint8_t default_pattern;
+
+    //transpose a 4x4 matrix
+    void trans(__m128 mat[4]);
+
+    //invert a 4x4 matrix
+    void inv(const __m128 mat[4], __m128 out[4]);
+
+    //perform a dot product of a row-vector and a matrix
+    //the matrix is in column-major order!!
+    __m128 dot(const __m128 vec, const __m128 mat[4]);
+
+    //perform a dot product of a matrix with a column vector
+    //the matrix is in row-major order!!
+    __m128 dot(const __m128 mat[4], const __m128 vec);
+
+    float dot(const __m128 vec0, const __m128 vec1);
+
+    //packs superpixels into SSE registers according to pattern
+    //the pattern is specified by P0(r,c) to P3(r,c) macros
+    //P0(r,c) means that pixel 0 of the SSE register appears in row,col = r,c of the 2x2 super pixel.
+    template<typename raw_type>
+    void pack_superpixels(const size_t rows, const size_t cols, const raw_type* raw, __m128* out, uint8_t pattern = default_pattern) {
+        size_t rows_2 = rows / 2, cols_2 = cols / 2;
+        int r,c,rx2,cx2;
+        #pragma omp parallel for private(r,c,rx2,cx2)
+        for (r = 0; r < rows_2; ++r) {
+            for (c = 0; c < cols_2; ++c) {
+                rx2 = r << 1, cx2 = c << 1;
+                out[IDX(r,c,cols_2)] =
+                    _mm_setr_ps(
+                        float(raw[IDX(rx2 | PATR(pattern, 0), cx2 | PATC(pattern, 0), cols)]),
+                        float(raw[IDX(rx2 | PATR(pattern, 1), cx2 | PATC(pattern, 1), cols)]),
+                        float(raw[IDX(rx2 | PATR(pattern, 2), cx2 | PATC(pattern, 2), cols)]),
+                        float(raw[IDX(rx2 | PATR(pattern, 3), cx2 | PATC(pattern, 2), cols)])
+                    );
+            }
+        }
+    }
+
+    void unpack_superpixels(const size_t rows, const size_t cols, const __m128* packed, __m128* out, uint8_t pattern = default_pattern);
+
+    void mask_low_high(const size_t n, const __m128* raw, float low, float high, __m128* out);
+    void mask_low(const size_t n, const __m128* raw, float low, __m128* out);
+    void mask_high(const size_t n, const __m128* raw, float high, __m128* out);
+
+    //packs super-pixels and does dot(gains[i], raw[i] - darks[i]) where gains[i] is row-major
+    template<typename raw_type>
+    void calibrate_matrix(const size_t rows, const size_t cols, const raw_type* raw, const __m128* darks, const __m128* gains, __m128* out, uint8_t pattern = default_pattern) {
+        size_t rows_2 = rows / 2, cols_2 = cols / 2;
+        int r, c, rx2, cx2;
+        __m128 packed;
+        #pragma omp parallel for private(r,c,rx2,cx2,packed)
+        for (r = 0; r < rows_2; ++r) {
+            for (c = 0; c < cols_2; ++c) {
+                rx2 = r << 1, cx2 = c << 1;
+                packed =
+                    _mm_setr_ps(
+                    float(raw[IDX(rx2 | PATR(pattern, 0), cx2 | PATC(pattern, 0), cols)]),
+                    float(raw[IDX(rx2 | PATR(pattern, 1), cx2 | PATC(pattern, 1), cols)]),
+                    float(raw[IDX(rx2 | PATR(pattern, 2), cx2 | PATC(pattern, 2), cols)]),
+                    float(raw[IDX(rx2 | PATR(pattern, 3), cx2 | PATC(pattern, 2), cols)])
+                    );
+                out[IDX(r, c, cols_2)] = dot(&gains[IDX(r, c, cols_2)], _mm_sub_ps(packed, darks[IDX(r, c, cols_2)]));
+            }
+        }
+    }
+
+    //packs super-pixels does dot(raw[i] - darks[i], gains[i]) where gains[i] is column-major
+    template<typename raw_type>
+    void calibrate_matrix2(const size_t rows, const size_t cols, const raw_type* raw, const __m128* darks, const __m128* gains, __m128* out, uint8_t pattern = default_pattern) {
+        size_t rows_2 = rows / 2, cols_2 = cols / 2;
+        int r, c, rx2, cx2;
+        __m128 packed;
+        #pragma omp parallel for private(r,c,rx2,cx2,packed)
+        for (r = 0; r < rows_2; ++r) {
+            for (c = 0; c < cols_2; ++c) {
+                rx2 = r << 1, cx2 = c << 1;
+                packed =
+                    _mm_setr_ps(
+                    float(raw[IDX(rx2 | PATR(pattern, 0), cx2 | PATC(pattern, 0), cols)]),
+                    float(raw[IDX(rx2 | PATR(pattern, 1), cx2 | PATC(pattern, 1), cols)]),
+                    float(raw[IDX(rx2 | PATR(pattern, 2), cx2 | PATC(pattern, 2), cols)]),
+                    float(raw[IDX(rx2 | PATR(pattern, 3), cx2 | PATC(pattern, 2), cols)])
+                    );
+                out[IDX(r, c, cols_2)] = dot(_mm_sub_ps(packed, darks[IDX(r, c, cols_2)]), gains[IDX(r, c, cols_2)]);
+            }
+        }
+    }
+
+    //does dot(gains[i], raw[i] - darks[i]) where gains[i] is row-major
+    void calibrate_matrix(const size_t n, const __m128* raw, const __m128* darks, const __m128* gains, __m128* out);
+
+    //does dot(raw[i] - darks[i], gains[i]) where gains[i] is column-major
+    void calibrate_matrix2(const size_t n, const __m128* raw, const __m128* darks, const __m128* gains, __m128* out);
+
+    struct edge_mode {
+        enum t {
+            ZERO,
+            WRAP,
+            REFLECT
+        };
+    };
+
+    //2D filtering (with 2D kernel)
+    void filter(const size_t rows, const size_t cols, const __m128* in, __m128* out, const size_t filt_rows, const size_t filt_cols, const __m128* filt, const edge_mode::t mode);
+    void filter_zero(const size_t rows, const size_t cols, const __m128* in, __m128* out, const size_t filt_rows, const size_t filt_cols, const __m128* filt);
+    void filter_wrap(const size_t rows, const size_t cols, const __m128* in, __m128* out, const size_t filt_rows, const size_t filt_cols, const __m128* filt);
+    void filter_reflect(const size_t rows, const size_t cols, const __m128* in, __m128* out, const size_t filt_rows, const size_t filt_cols, const __m128* filt);
+
+    //2D filtering (with 1D kernel applied on both axes)
+    void filter(const size_t rows, const size_t cols, const __m128 *in, __m128 *out, const size_t filt_size, const __m128 *filt, const edge_mode::t mode);
+    void filter_zero(const size_t rows, const size_t cols, const __m128 *in, __m128 *out, const size_t filt_size, const __m128 *filt);
+    void filter_wrap(const size_t rows, const size_t cols, const __m128 *in, __m128 *out, const size_t filt_size, const __m128 *filt);
+    void filter_reflect(const size_t rows, const size_t cols, const __m128 *in, __m128 *out, const size_t filt_size, const __m128 *filt);
+
+    //does dot(R[i], img[i]) where R[i] is row-major
+    void stokes(const size_t n, const __m128* img, const __m128* R, __m128* out);
+
+    //does dot(R, img[i]) where R is row-major
+    void stokesR(const size_t n, const __m128* img, const __m128 R[4], __m128* out);
+
+    //does dot(img[i], R[i]) where R[i] is column-major
+    void stokes2(const size_t n, const __m128* img, const __m128*  R, __m128* out);
+
+    //does dot(img[i], R) where R is column-major
+    void stokes2R(const size_t n, const __m128* img, const __m128  R[4], __m128* out);
+
+    //assumes 0,90,45,135 pixel pattern:
+    void stokes(const size_t n, const __m128* img, __m128* out);
+
+    //extract a single element from the __m128
+    void element(const size_t param, const size_t n, const __m128* simg, float* out);
+
+    //degree of polarization: hypot(s1,s2,s3)/s0
+    void dop(const size_t n, const __m128* simg, float* out);
+
+    //degree of linear polarization: hypot(s1,s2)/s0
+    void dolp(const size_t n, const __m128* simg, float* out);
+
+    //degree of circular polarization: abs(s3)/s0
+    void docp(const size_t n, const __m128* simg, float* out);
+
+    //angle of polarization: 0.5*atan(s2/s1)
+    void aop(const size_t n, const __m128* simg, float* out);
+
+    //2x angle of polarization: atan(s2/s1)
+    void aopx2(const size_t n, const __m128* simg, float* out);
+
+    //ellipticity angle: 0.5*asin(s3/s0)
+    void ella(const size_t n, const __m128* simg, float* out);
+
+    //TODO:
+    //decompose stokes vector?
+
+}
+
+
+#endif // POLARIZATION_HPP