Feature/flask server render #20

Open

wants to merge 46 commits into base: master from feature/flask_server_render

Changes from 45 commits

Commits (46)
a261955
Added directory for ipython notebooks.
psigen Nov 27, 2016
2303643
Added initial, non-working version of Data Interpolation.
psigen Nov 27, 2016
4cf5452
Renamed "temp" to "temperature".
psigen Nov 27, 2016
8181975
Added gitignore for IPython checkpoints.
psigen Nov 27, 2016
2eaa59a
Cleaned up and nominally working data interpolation script.
psigen Nov 27, 2016
09aaca5
Added a data export script that writes CSV for a given logfile.
psigen Nov 27, 2016
0dabae6
Added data export function cleanup.
psigen Nov 27, 2016
e8820ca
Added switch for partial sensor export.
psigen Nov 27, 2016
7187f2c
Added combined and independent data export scripts.
psigen Nov 27, 2016
0a83a1c
Fixed typo in spelling.
psigen Nov 27, 2016
200a39a
Minor bugfixes to loading.
psigen Nov 27, 2016
2b798a5
[WIP] Added updates to render actual map overlay.
psigen Nov 27, 2016
a9995ae
Created very barebones rendering system for logs.
psigen Nov 28, 2016
567a692
Update README.md
psigen Nov 29, 2016
a7800d0
Added missing float cast (for integer values).
psigen Dec 2, 2016
15d830c
Only plot vehicle position for sensor readings.
psigen Dec 2, 2016
b6d2d16
Modified data interpolation notebook to ignore data from times where …
jjblum May 3, 2017
12a502c
Added .csv file output of the sensor DataFrame.
jjblum Oct 3, 2017
b4d8246
Started a prototype for extracting data collected while a sampler jar…
jjblum Oct 3, 2017
d52736f
Added merging of files.
jjblum Oct 4, 2017
cef962e
Minor changes.
jjblum Oct 13, 2017
492d656
Need to debug why certain files cause infinite memory usage issues.
jjblum Nov 27, 2017
de4e214
saving netherlands scripts
shawnhanna Dec 7, 2017
f7f63a8
updated histogram plotting example
shawnhanna Dec 7, 2017
8b07567
more updates to histogram and jupyter notebook. added dependencies to…
shawnhanna Apr 6, 2018
5893b0a
added support for intcatch unstable logs (I am calling them v4_3_0)
shawnhanna Jun 9, 2018
b6ca302
updated interp notebook to handle ERM data
shawnhanna Jun 9, 2018
d84f005
started working on the file processing server pipeline for ERM
shawnhanna Jun 13, 2018
e8ba77d
some changes to the flask image server
shawnhanna Jun 19, 2018
c54f85b
starting work on processor
shawnhanna Jun 28, 2018
e47f474
added pose fixer
shawnhanna Jun 28, 2018
3934e15
added histogram for insitu data
shawnhanna Jul 10, 2018
e27ecca
added more functionality to data processor and image server. seems to…
shawnhanna Jul 10, 2018
75dff2d
a few more updates to the server
shawnhanna Jul 10, 2018
7be8098
finished up the initial server pipeline
shawnhanna Jul 11, 2018
62fa18d
fixed range issue with processing script so it now processes all poss…
shawnhanna Jul 12, 2018
d572b4d
updates for AWS usage of server
shawnhanna Jul 16, 2018
98045d9
final changes for v0.1
shawnhanna Jul 16, 2018
83931b0
added links to other data types in each render page
shawnhanna Jul 25, 2018
1dcd4d6
changes to color min and max for ph
shawnhanna Aug 2, 2018
6728dfa
added histograms and csv files
shawnhanna Aug 2, 2018
472d1f1
Merge pull request #18 from platypusllc/feature/insitu-log-parsing
shawnhanna Sep 7, 2018
3cae5d1
Merge branch 'master' into feature/flask_server_render
shawnhanna Sep 7, 2018
cefeb27
some formatting and updates for erm
shawnhanna Sep 11, 2018
cad42f8
Merge branch 'feature/insitu-log-parsing' into feature/flask_server_r…
shawnhanna Sep 11, 2018
48e5d4f
updates to ERM scripts
shawnhanna Oct 10, 2018
3 changes: 3 additions & 0 deletions .gitignore
@@ -55,3 +55,6 @@ docs/_build/

# PyBuilder
target/

# IPython
.ipynb_checkpoints
1 change: 1 addition & 0 deletions README.md
@@ -18,6 +18,7 @@ $ pip install git+https://github.com/platypusllc/analytics.git

* **API documentation** can be found on [ReadTheDocs](http://platypus-analytics.readthedocs.org/en/latest/).
* **Usage examples** of this library can be found in the [examples](examples) directory.
* **IPython/Jupyter notebooks** using this library can be found in the [notebooks](notebooks) directory.

[1]: http://docs.python-guide.org/en/latest/dev/virtualenvs/
[2]: https://www.continuum.io/documentation
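As a quick orientation for the new example scripts below, loading a vehicle log with this library is a one-liner; a minimal sketch (the filename is taken from the test constants in EC_zero_trim.py):

import platypus.io.logs

# load() returns a dict mapping each sensor name to a pandas DataFrame of
# its readings, indexed by timestamp.
data = platypus.io.logs.load("platypus_20171003_050016.txt")
print(data.keys())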
197 changes: 197 additions & 0 deletions examples/EC_zero_trim.py
@@ -0,0 +1,197 @@
import collections
import platypus.io.logs
import platypus.util.conversions
import numpy as np
import datetime
import json
import six
import re
import pandas
import glob


# FILE TO TEST JAR DATA EXTRACTION
PATH = "/home/jason/Documents/INTCATCH/phone logs/Gardaland outlet/2017-10-3/"
FILE = "platypus_20171003_050016"
EXT = ".txt"

# FILES TO TEST MERGING
PATH2 = "/home/jason/Documents/INTCATCH/phone logs/Gardaland outlet/2017-10-4/"
FILE1 = "platypus_20171004_040203"
FILE2 = "platypus_20171004_054619"

"""
def trim_EC():
global PATH, FILE
print "\nLoading all the data in " + PATH + FILE + "\n"
data = platypus.io.logs.load(PATH + FILE)
if "ES2" in data:
print "ES2 sensor is present. Trimming all data within EC < 100 time windows\n"
# find all time windows where EC is exactly 0
ES2_data = data["ES2"]
values = ES2_data["ec"].values
ec_eq_zero_indices = np.where(values < 100)[0]
windows = list()
windows.append([ec_eq_zero_indices[0]])
left = ec_eq_zero_indices[0]
for ii in range(1, ec_eq_zero_indices.shape[0]):
i = ec_eq_zero_indices[ii]
if i - left > 5:
# there has been a jump in index, a new time window has started
windows[-1].append(left)
windows.append([i])
left = i
windows[-1].append(ec_eq_zero_indices[-1])
# print ec_eq_zero_indices
# print windows
for window in windows:
time_window = [ES2_data["ec"].index.values[window[0]], ES2_data["ec"].index.values[window[1]]]
for k in data.keys():
data[k] = data[k].loc[np.logical_or(data[k].index < time_window[0], data[k].index > time_window[1])]

else:
print "No ES2 sensor present. No trimming will be performed."

# do stuff with data
"""


def merge_files(filename_list):
"""

:param: filename_list: list of full path filename strings
:return: One result will all the dataframes merged
:rtype: {str: pandas.DataFrame}
"""
logfile_result_list = [platypus.io.logs.load(filename) for filename in filename_list]
if len(logfile_result_list) == 1:
return logfile_result_list[0]
    all_data_types = set()
    for logfile_result in logfile_result_list:
        all_data_types = all_data_types.union(set(logfile_result.keys()))
    print all_data_types

# merged_dataframe = pandas.DataFrame.merge(merged_dataframe[data_type], dataframe_list[i][data_type], how='outer')
merged_dataframe_dict = dict()

    for data_type in all_data_types:
        # Use the first log that contains this data type as the base frame.
        for i in range(len(logfile_result_list)):
            if data_type in logfile_result_list[i]:
                first_log_index = i
                break
        merged_dataframe_dict[data_type] = logfile_result_list[first_log_index][data_type]
        # Fold each later log into the base; combine_first() keeps existing rows
        # and fills in rows that only appear in the later log.
        for i in range(first_log_index + 1, len(logfile_result_list)):
            if data_type in logfile_result_list[i]:
                merged_dataframe_dict[data_type] = merged_dataframe_dict[data_type].combine_first(logfile_result_list[i][data_type]).dropna(how='any')
    return merged_dataframe_dict


def trim_using_EC(dataframe, threshold=100):
    """
    Trim out any data collected while EC was below the given threshold.
    :return: trimmed dataframe
    """
    if "ES2" in dataframe:
        print "ES2 sensor is present. Trimming all data within EC < {:.0f} time windows\n".format(threshold)
        # find all time windows where EC is below the threshold
ES2_data = dataframe["ES2"]
values = ES2_data["ec"].values
ec_eq_zero_indices = np.where(values < threshold)[0]
windows = list()
windows.append([ec_eq_zero_indices[0]])
left = ec_eq_zero_indices[0]
        for ii in range(1, ec_eq_zero_indices.shape[0]):
            i = ec_eq_zero_indices[ii]
            if i - left > 5:
                # a jump in index: the previous window ends and a new one starts
                windows[-1].append(left)
                windows.append([i])
            # track the previous low-EC index so windows close at their last sample
            left = i
        windows[-1].append(ec_eq_zero_indices[-1])
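        # Hypothetical example: if ec_eq_zero_indices were [3, 4, 5, 40, 41],
        # the jump from 5 to 40 closes the first window, giving
        # windows = [[3, 5], [40, 41]] -- index pairs bounding each low-EC span.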
# print ec_eq_zero_indices
# print windows
for window in windows:
time_window = [ES2_data["ec"].index.values[window[0]], ES2_data["ec"].index.values[window[1]]]
for k in dataframe:
dataframe[k] = dataframe[k].loc[np.logical_or(dataframe[k].index < time_window[0], dataframe[k].index > time_window[1])]
else:
print "No ES2 sensor present. No trimming will be performed."
return dataframe


def data_with_sampler(filename):
data = platypus.io.logs.load(filename)
is_EC_gt_100 = False

jar_start_timestamps = dict()
with open(filename, 'r') as logfile:
raw_data = collections.defaultdict(list)
start_time = datetime.datetime.utcfromtimestamp(0)

for line in logfile:
            # Extract each line from the logfile and convert the timestamp.
time_offset_ms, level, message = line.split('\t', 2)

# Compute the timestamp for each log entry.
time_offset = datetime.timedelta(milliseconds=int(time_offset_ms))
timestamp = start_time + time_offset

# Try to parse the log as a JSON object.
try:
entry = json.loads(message)
except ValueError as e:
raise ValueError(
"Aborted after invalid JSON log message '{:s}': {:s}".format(message, e))

# If the line is a datetime, compute subsequent timestamps from this.
# We assume that "date" and "time" are always together in the entry.
if 'date' in entry:
timestamp = datetime.datetime.utcfromtimestamp(entry['time'] / 1000.)
start_time = timestamp - time_offset
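            # e.g. an entry with 'time' = 1507000000000 (epoch ms) on a line
            # 60000 ms into the log pins start_time 60 s before that instant,
            # so every other line's offset becomes an absolute timestamp.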

# Extract appropriate data from each entry.
for k, v in six.viewitems(entry):
if k == 'sensor':
                    if v['type'] == "ES2":
                        ec = v['data'][0]
                        # Track whether the boat is in the water: EC above 100
                        # means the ES2 probe is submerged.
                        if not is_EC_gt_100 and ec > 100:
                            is_EC_gt_100 = True
                        if is_EC_gt_100 and ec < 100:
                            is_EC_gt_100 = False
if k == 'sampler' and is_EC_gt_100:
if "start" in v:
# the in-water sampler start messages
m = re.search('[0-9]+', v)
jar_id = m.group(0)
jar_start_timestamps[jar_id] = timestamp

# TODO: MUST MERGE IN THE LATITUDE AND LONGITUDE!!!

return data, jar_start_timestamps


def extract_sampler_data_by_jar():
global PATH, FILE, EXT
filename = PATH + FILE + EXT
data, jar_start_timestamps = data_with_sampler(filename)
trimmed_data = trim_using_EC(data)
for k in jar_start_timestamps:
start_time = jar_start_timestamps[k]
end_time = start_time + datetime.timedelta(minutes=3.75)
print "Jar {} lasts from {} to {}".format(k, start_time, end_time)
for sensor in data.keys():
print sensor
if sensor not in ["ES2", "ATLAS_DO", "ATLAS_PH"]:
continue
dataframe = trimmed_data[sensor]
relevantframe = dataframe.between_time(start_time.time(), end_time.time())
output_filename = PATH + FILE + "__JAR_{}".format(k) + "__{}".format(sensor) + ".csv"
relevantframe.to_csv(output_filename)


if __name__ == "__main__":
merged_data = merge_files(glob.glob("/home/shawn/day2/*.txt"))




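A minimal usage sketch for the helpers above, assuming logs from a single field day (the directory and threshold are hypothetical placeholders, not paths from this PR):

import glob

# Merge every log from one outing into a single {sensor: DataFrame} dict,
# then drop readings taken while the ES2 probe was out of the water.
merged_data = merge_files(glob.glob("/path/to/logs/*.txt"))
trimmed_data = trim_using_EC(merged_data, threshold=50)

for sensor, frame in trimmed_data.items():
    print("{}: {} rows".format(sensor, len(frame)))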
162 changes: 162 additions & 0 deletions examples/histogram-insitu.py
@@ -0,0 +1,162 @@
#!/usr/bin/env python
# coding: utf-8

"""
Example of loading Platypus vehicle logs from file.

Data is loaded as time series into Pandas[1] DataFrames, from which they can
be interpolated and filtered for other purposes, or used as input into Numpy[2]
or Scipy[3] functions.

[1]: http://pandas.pydata.org/
[2]: http://www.numpy.org/
[3]: http://www.scipy.org/
"""
import matplotlib.pyplot as plt
import platypus.io.insitu_logs
import platypus.util.conversions
import glob
import os
import numpy as np
import math

# Read the data log from file.
# Note: for log versions <5.0, this filename must be 'airboat_[timestamp].txt'.

def trim_using_EC(dataframe, threshold=100):
    """
    Trim out any data collected while EC was below the given threshold.
    :return: trimmed dataframe
    """
    if "ES2" in dataframe:
        print("ES2 sensor is present. Trimming all data within EC < {:.0f} time windows\n".format(threshold))
        # find all time windows where EC is below the threshold
ES2_data = dataframe["ES2"]
values = ES2_data["ec"].values
ec_eq_zero_indices = np.where(values < threshold)[0]
windows = list()
windows.append([ec_eq_zero_indices[0]])
left = ec_eq_zero_indices[0]
        for ii in range(1, ec_eq_zero_indices.shape[0]):
            i = ec_eq_zero_indices[ii]
            if i - left > 5:
                # a jump in index: the previous window ends and a new one starts
                windows[-1].append(left)
                windows.append([i])
            # track the previous low-EC index so windows close at their last sample
            left = i
        windows[-1].append(ec_eq_zero_indices[-1])
# print(ec_eq_zero_indices)
# print(windows)
for window in windows:
time_window = [ES2_data["ec"].index.values[window[0]], ES2_data["ec"].index.values[window[1]]]
for k in dataframe:
dataframe[k] = dataframe[k].loc[np.logical_or(dataframe[k].index < time_window[0], dataframe[k].index > time_window[1])]
else:
print("No ES2 sensor present. No trimming will be performed.")
return dataframe


def load_data(folders=None, files=None, ec_trim_value=50):
    if not folders:
        folders = [os.getcwd()]
    if files is None:
        files = []

# print("folders", str(folders))

for folder in folders:
files.extend(glob.glob(folder+'/*fixed*.csv'))

# print("log files: " + str(files))
# todo: remove duplicates?

data = platypus.io.insitu_logs.merge_files(files)
data = trim_using_EC(data, ec_trim_value)
return data

def plot_hist_sensor(data, sensor = 'ES2', num_bins = 10, hide_top_n_percent = 0, hide_bot_n_percent = 0, save_dir = "~/save"):
num_readings = len(data[sensor])
# Get the std of the data
sensor_stddev = data[sensor].std()
# Get the mean of the data
sensor_mean = data[sensor].mean()
# Get the min of the data
sensor_min = data[sensor].min()
# Get the max of the data
sensor_max = data[sensor].max()

print(sensor+" number of readings", num_readings)
print(sensor+" std", sensor_stddev)
print(sensor+" mean", sensor_mean)
print(sensor+" min", sensor_min)
print(sensor+" max", sensor_max)

    hist_max = math.ceil(sensor_max - hide_top_n_percent * 0.01 * sensor_max)
    hist_min = math.floor(sensor_min + hide_bot_n_percent * 0.01 * sensor_min)
    if num_bins > 0:
        bin_size = (hist_max - hist_min) / float(num_bins)
        # include the right edge so there are exactly num_bins bins
        bins = np.arange(hist_min, hist_max + bin_size, bin_size)
# print(hist_max, hist_min, bin_size, bins)

# n, bins, patches = plt.hist(data[sensor], bins=xrange(200,1600,100))
weights = np.ones_like(data[sensor])/float(num_readings) * 100
if (num_bins <= 0):
n, bins, patches = plt.hist(data[sensor], weights=weights)
else:
n, bins, patches = plt.hist(data[sensor], weights=weights, bins=bins)

# print(n, bins, patches)

    plt.xlabel(sensor)
    plt.ylabel('Percentage of values in the given range')
    plt.ylim(0, 50)
    plt.title('Histogram of ' + sensor + ' ' + save_dir.split('/')[-1])
    # plt.text(0, .25, "Standard Dev: " + str(es2_stddev))
    plt.figtext(.16, .75, "Mean: " + str(sensor_mean))
    plt.figtext(.16, .7, "std: " + str(sensor_stddev))
    plt.grid(True)
    # Save after all annotations are drawn so they appear in the .png as well.
    plt.savefig(save_dir + "/" + 'Histogram of ' + sensor + ' ' + save_dir.split('/')[-1] + '.png')
    plt.show()

def get_folders():
# folders = ['/home/shawn/NL2/grokeneveldse_polder/grokeneveldse_polder_feb_2018/']
folders = ['/home/shawn/data/june 18 2018 - NL delfgauw/day_1', '/home/shawn/data/june 18 2018 - NL delfgauw/day_2']
return folders

def main():
print("enter EC trim value: ")
new_ec_trim = int(raw_input())

folders = get_folders()

data = load_data(folders=folders, ec_trim_value = new_ec_trim)
print(data)
print("data columns: ", data.keys)

num_bins = 10
hide_bot_n_percent = 10
hide_top_n_percent = 10
    while True:
        print("what would you like to do?")
        print("q: quit")
        print("0: change number of bins (currently: " + str(num_bins) + ")")
        print("1: change percentage of bottom to hide (currently: " + str(hide_bot_n_percent) + ")")
        print("2: change percentage of top to hide (currently: " + str(hide_top_n_percent) + ")")
        for i, x in enumerate(data.keys()):
            print(str(i + 3) + ": plot " + x)

        command = raw_input()

        if command == 'q':
            break
        elif command == '0':
            num_bins = int(raw_input("new number of bins: "))
        elif command == '1':
            hide_bot_n_percent = int(raw_input("new bottom percentage: "))
        elif command == '2':
            hide_top_n_percent = int(raw_input("new top percentage: "))
        elif int(command) - 3 < len(data.keys()):
            plot_hist_sensor(data, data.keys()[int(command) - 3],
                             num_bins=num_bins,
                             hide_top_n_percent=hide_top_n_percent,
                             hide_bot_n_percent=hide_bot_n_percent,
                             save_dir=folders[0])
        else:
            print(command + " is not valid")


if __name__ == '__main__':
main()
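For a non-interactive run, the helpers above can be driven directly; a minimal sketch (the folder path is a hypothetical placeholder, and note that load_data() only picks up CSVs whose names match '*fixed*.csv'):

folders = ['/path/to/fixed/csv/logs']
data = load_data(folders=folders, ec_trim_value=50)

# 20-bin histogram of the ES2 readings with the top and bottom 5% of the
# value range hidden; the plot is also saved as a .png into the folder.
plot_hist_sensor(data, sensor='ES2', num_bins=20,
                 hide_top_n_percent=5, hide_bot_n_percent=5,
                 save_dir=folders[0])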