diff --git a/README.md b/README.md index 925f400..532ede5 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,9 @@ | [![doc](https://img.shields.io/badge/docs-dev-blue.svg)](https://zavalab.github.io/PlasmoData.jl/dev) | [![build](https://github.com/zavalab/PlasmoData.jl/actions/workflows/ci.yml/badge.svg)](https://github.com/zavalab/PlasmoData.jl/actions) | [![codecov](https://codecov.io/gh/zavalab/PlasmoData.jl/branch/main/graph/badge.svg?token=LZJ3T1XQZ0)](https://app.codecov.io/gh/zavalab/PlasmoData.jl) | -PlasmoData.jl is a package for [Julia](https://julialang.org/) designed for representing and modeling data as graphs and for building graph models that contain large amounts of data on the nodes or edges of the graph. This package also has an accompanying package [DataGraphPlots.jl](https://github.com/zavalab/DataGraphPlots.jl) which can be used for plotting the graphs. +PlasmoData.jl is a package for [Julia](https://julialang.org/) designed for representing and modeling data as graphs and for building graph models that contain large amounts of data on the nodes or edges of the graph. This package also has an accompanying package [PlasmoDataPlots.jl](https://github.com/zavalab/PlasmoDataPlots.jl) which can be used for plotting the graphs. + +PlasmoData.jl is built on the abstraction called a `DataGraph`. The manuscript ["PlasmoData.jl -- A Julia Framework for Modeling and Analyzing Complex Data as Graphs"](https://arxiv.org/abs/2401.11404) details the abstraction and this package. 
## Bug Reports and Support diff --git a/examples/basic_functions.jl b/examples/basic_functions.jl index c8a9d75..32c07ec 100644 --- a/examples/basic_functions.jl +++ b/examples/basic_functions.jl @@ -1,6 +1,6 @@ using Revise using PlasmoData, Graphs -using DataGraphPlots +using PlasmoDataPlots dg = DataGraph() @@ -28,4 +28,4 @@ add_edge_data!(dg, "node4", 1, 1.0, "weight") add_edge_data!(dg, :node5, 2, -.00001, "weight") add_edge_data!(dg, 3, "node4", 1, "weight") -DataGraphPlots.plot_graph(dg; xdim = 400, ydim = 400) +PlasmoDataPlots.plot_graph(dg; xdim = 400, ydim = 400) diff --git a/examples/edge_weighted_EC.jl b/examples/edge_weighted_EC.jl index d7ec7b9..1542574 100644 --- a/examples/edge_weighted_EC.jl +++ b/examples/edge_weighted_EC.jl @@ -2,7 +2,7 @@ using Revise using PlasmoData, Graphs using JLD, LinearAlgebra using Plots, Statistics -using DataGraphPlots +using PlasmoDataPlots # Data for this example comes from Alex Smith's paper on the Euler Characteristic: # https://doi.org/10.1016/j.compchemeng.2021.107463 diff --git a/examples/matrix_to_graph.jl b/examples/matrix_to_graph.jl index 6c9d43d..34ead8d 100644 --- a/examples/matrix_to_graph.jl +++ b/examples/matrix_to_graph.jl @@ -1,6 +1,6 @@ using Revise using PlasmoData, Graphs -using DataGraphPlots +using PlasmoDataPlots mat = rand(10, 10) diff --git a/examples/tensor_to_graph.jl b/examples/tensor_to_graph.jl index 3764e15..ea166c3 100644 --- a/examples/tensor_to_graph.jl +++ b/examples/tensor_to_graph.jl @@ -1,7 +1,7 @@ using Revise using PlasmoData, Graphs using Statistics, DelimitedFiles -using DataGraphPlots +using PlasmoDataPlots abc = rand(10, 4, 5) diff --git a/src/datadigraphs/utils.jl b/src/datadigraphs/utils.jl index 78b9495..8c1c04c 100644 --- a/src/datadigraphs/utils.jl +++ b/src/datadigraphs/utils.jl @@ -1,19 +1,19 @@ """ filter_nodes(datadigraph, filter_value, attribute = dg.node_data.attributes[1]; fn = isless) -Removes the nodes of the graph whose weight value of `attribute` is greater 
than the given -`filter_value`. If `attribute` is not specified, this defaults to the first attribute within -the DataGraph's `NodeData`. +Removes the nodes of the graph whose data on `attribute` does not meet the criteria of `fn` +with respect to `filter_value`. If `attribute` is not specified, this defaults to the first +attribute within the DataDiGraph's `NodeData`. -`fn` is a function that takes an input of two scalar values and is broadcast to the data vector. -For example, isless, isgreater, isequal +`fn` is a function that takes an input of a node's data on `attribute` and the `filter_value` +and returns `true` or `false` """ function filter_nodes( dg::DataDiGraph, filter_val::R, attribute::String=dg.node_data.attributes[1]; fn::Function = isless -) where {R <: Real} +) where {R <: Any} node_attributes = dg.node_data.attributes edge_attributes = dg.edge_data.attributes @@ -27,7 +27,7 @@ function filter_nodes( edges = dg.edges if length(node_attributes) == 0 - error("No node weights are defined") + error("No node data are defined") end T = eltype(dg) @@ -107,19 +107,19 @@ end """ filter_edges(datadigraph, filter_value, attribute = dg.edge-data.attributes[1]; fn = isless) -Removes the edges of the graph whose weight value of `attribute` is greater than the given -`filter_value`. If `attribute` is not specified, this defaults to the first attribute within -the DataGraph's `EdgeData`. +Removes the edges of the graph whose data on `attribute` does not meet the criteria of `fn` +with respect to `filter_value`. If `attribute` is not specified, this defaults to the first +attribute within the DataDiGraph's `EdgeData`. -`fn` is a function that takes an input of two scalar values and is broadcast to the data vector. 
-For example, isless, isgreater, isequal +`fn` is a function that takes an input of an edge's data on `attribute` and the `filter_value` +and returns `true` or `false` """ function filter_edges( dg::DataDiGraph, filter_val::R, attribute::String = dg.edge_data.attributes[1]; fn::Function = isless -) where {R <: Real} +) where {R <: Any} nodes = dg.nodes edges = dg.edges @@ -132,7 +132,7 @@ function filter_edges( edge_attribute_map = dg.edge_data.attribute_map if length(edge_attributes) == 0 - error("No node weights are defined") + error("No edge data are defined") end T = eltype(dg) @@ -353,7 +353,10 @@ function remove_edge!( end """ - aggregate(datadigraph, node_list, aggregated_node_name; node_fn = mean, edge_fn = mean) + aggregate(datadigraph, node_list, aggregated_node_name; + node_fn = mean, edge_fn = mean, save_agg_edge_data = false, + agg_edge_fn = mean, agg_edge_val = 0, node_attributes_to_add = String[] + ) Aggregates all the nodes in `node_list` into a single node which is called `aggregated_node_name`. If nodes have any weight/attribute values defined, these values are combined via the `node_fn` function. @@ -361,14 +364,25 @@ The default for `node_fn` is Statistics.mean which averages the data for the nod Edge data are also are also combined via the `edge_fn` when two or more nodes in the `node_list` are connected to the same node and these edges have data defined on them. The `edge_fn` also defaults to `Statistics.mean` + +If edges exist between nodes in `node_list`, the data on these edges can optionally be saved on +the `aggregated_node_name` node by setting `save_agg_edge_data = true`. If true, then the edge data +on these edges is aggregated using `agg_edge_fn`. If the user wants to define new attribute names for +this data, they can pass a vector to `node_attributes_to_add`; if no vector is defined, the data will +be aggregated under the names of the `edge_data` attributes. 
All other nodes except the aggregated +nodes will have these attributes initialized as `agg_edge_val`. """ function aggregate( dg::DataDiGraph, node_set::Vector, new_name::N; node_fn::Function = _default_mean, - edge_fn::Function = _default_mean -) where {N <: Any} + edge_fn::Function = _default_mean, + save_agg_edge_data::Bool = false, + agg_edge_fn::Function = _default_mean, + agg_edge_val::R = 0., + node_attributes_to_add::Vector{String} = String[] +) where {N <: Any, R <: Any} nodes = dg.nodes node_map = dg.node_map @@ -389,6 +403,26 @@ function aggregate( error("New node name already exists in set of non-aggregated nodes") end + if save_agg_edge_data + if length(dg.edge_data.attributes) > 0 && length(node_attributes_to_add) > 0 + if length(dg.edge_data.attributes) != length(node_attributes_to_add) + error("Length of the node_attributes_to_add does not match the edge_data attributes") + end + for i in 1:length(node_attributes_to_add) + if node_attributes_to_add[i] in node_attributes + error("Attribute name $(node_attributes_to_add[i]) is already defined in node_attributes") + end + end + elseif length(dg.edge_data.attributes) > 0 + attribute_names = dg.edge_data.attributes + for i in 1:length(attribute_names) + if attribute_names[i] in node_attributes + error("Edge data attribute names conflict with node data attributes; user must pass node_attributes_to_add") + end + end + end + end + T = eltype(dg) T1 = eltype(get_node_data(dg)) M1 = typeof(get_node_data(dg)) @@ -445,6 +479,8 @@ function aggregate( edge_bool_avg_index = Dict{Tuple{T, T}, Vector{T}}() new_edge_data = fill(0, (0, length(edge_attributes))) + removed_edge_bool_vec = [false for i in 1:length(edges)] + for i in 1:length(nodes) node_name_mapping[node_map[nodes[i]]] = nodes[i] end @@ -532,6 +568,10 @@ function aggregate( end end end + else + if save_agg_edge_data + removed_edge_bool_vec[i] = true + end end end @@ -548,6 +588,44 @@ function aggregate( new_dg.edge_data.attributes = 
copy(edge_attributes) new_dg.edge_data.attribute_map = copy(edge_attribute_map) new_dg.edge_data.data = copy(new_edge_data) + + if save_agg_edge_data + new_node_data = new_dg.node_data.data + new_node_attributes = new_dg.node_data.attributes + new_node_attribute_map = new_dg.node_data.attribute_map + if length(node_attributes) > 0 + edge_data_to_avg = edge_data[removed_edge_bool_vec, :] + if length(node_attributes_to_add) > 0 + attributes_to_add = node_attributes_to_add + else + attributes_to_add = edge_attributes + end + for j in 1:length(attributes_to_add) + push!(new_node_attributes, attributes_to_add[j]) + new_node_attribute_map[attributes_to_add[j]] = length(new_node_attributes) + end + data_to_add = fill(agg_edge_val, (length(new_nodes), length(edge_attributes))) + data_to_add[length(new_nodes), :] .= agg_edge_fn(edge_data_to_avg) + old_data = new_node_data + new_dg.node_data.data = hcat(old_data, data_to_add) + else + edge_data_to_avg = edge_data[removed_edge_bool_vec, :] + if length(node_attributes_to_add) > 0 + attributes_to_add = node_attributes_to_add + else + attributes_to_add = edge_attributes + end + new_dg.node_data.attributes = attributes_to_add + new_node_attribute_map = new_dg.node_data.attribute_map + for j in 1:length(attributes_to_add) + new_node_attribute_map[attributes_to_add[j]] = j + end + data_to_add = fill(agg_edge_val, (length(new_nodes), length(edge_attributes))) + data_to_add[length(new_nodes), :] .= agg_edge_fn(edge_data_to_avg) + old_data = zeros(T1, (length(new_nodes), 0)) + new_dg.node_data.data = hcat(old_data, data_to_add) + end + end end simple_digraph = Graphs.SimpleDiGraph(T(length(new_edges)), fadjlist, badjlist) diff --git a/src/datagraphs/utils.jl b/src/datagraphs/utils.jl index 1cef805..da76c41 100644 --- a/src/datagraphs/utils.jl +++ b/src/datagraphs/utils.jl @@ -390,19 +390,19 @@ end """ filter_nodes(datagraph, filter_value, attribute = dg.node_data.attributes[1]; fn = isless) -Removes the nodes of the graph whose weight 
value of `attribute` is greater than the given -`filter_value`. If `attribute` is not specified, this defaults to the first attribute within -the DataGraph's `NodeData`. +Removes the nodes of the graph whose data on `attribute` does not meet the criteria of `fn` +with respect to `filter_value`. If `attribute` is not specified, this defaults to the first +attribute within the DataGraph's `NodeData`. -`fn` is a function that takes an input of two scalar values and is broadcast to the data vector. -For example, isless, isgreater, isequal +`fn` is a function that takes an input of a node's data on `attribute` and the `filter_value` +and returns `true` or `false` """ function filter_nodes( dg::DataGraph, filter_val::R, attribute::String=dg.node_data.attributes[1]; fn::Function = isless -) where {R <: Real} +) where {R <: Any} node_attributes = dg.node_data.attributes edge_attributes = dg.edge_data.attributes @@ -415,7 +415,7 @@ function filter_nodes( edges = dg.edges if length(node_attributes) == 0 - error("No node weights are defined") + error("No node data are defined") end T = eltype(dg) @@ -498,19 +498,19 @@ end """ filter_edges(datagraph, filter_value, attribute = dg.edge_data.attributes[1]; fn = isless) -Removes the edges of the graph whose weight value of `attribute` is greater than the given -`filter_value`. If `attribute` is not specified, this defaults to the first attribute within -the DataGraph's `EdgeData`. +Removes the edges of the graph whose data on `attribute` does not meet the criteria of `fn` +with respect to `filter_value`. If `attribute` is not specified, this defaults to the first +attribute within the DataGraph's `EdgeData`. -`fn` is a function that takes an input of two scalar values and is broadcast to the data vector. 
-For example, isless, isgreater, isequal +`fn` is a function that takes an input of an edge's data on `attribute` and the `filter_value` +and returns `true` or `false` """ function filter_edges( dg::DataGraph, filter_val::R, attribute::String = dg.edge_data.attributes[1]; fn::Function = isless -) where {R <: Real} +) where {R <: Any} nodes = dg.nodes edges = dg.edges @@ -522,7 +522,7 @@ function filter_edges( edge_attribute_map = dg.edge_data.attribute_map if length(edge_attributes) == 0 - error("No node weights are defined") + error("No edge data are defined") end T = eltype(dg) @@ -814,7 +814,10 @@ function remove_edge!( end """ - aggregate(datagraph, node_list, aggregated_node_name; node_fn = mean, edge_fn = mean) + aggregate(datagraph, node_list, aggregated_node_name; + node_fn = mean, edge_fn = mean, save_agg_edge_data = false, + agg_edge_fn = mean, agg_edge_val = 0, node_attributes_to_add = String[] + ) Aggregates all the nodes in `node_list` into a single node which is called `aggregated_node_name`. If nodes have any weight/attribute values defined, these values are combined via the `node_fn` function. @@ -822,14 +825,25 @@ The default for `node_fn` is Statistics.mean which averages the data for the nod Edge data are also are also combined via the `edge_fn` when two or more nodes in the `node_list` are connected to the same node and these edges have data defined on them. The `edge_fn` also defaults to `Statistics.mean` + +If edges exist between nodes in `node_list`, the data on these edges can optionally be saved on +the `aggregated_node_name` node by setting `save_agg_edge_data = true`. If true, then the edge data +on these edges is aggregated using `agg_edge_fn`. If the user wants to define new attribute names for +this data, they can pass a vector to `node_attributes_to_add`; if no vector is defined, the data will +be aggregated under the names of the `edge_data` attributes. 
All other nodes except the aggregated +nodes will have these attributes initialized as `agg_edge_val`. """ function aggregate( dg::DataGraph, node_set::Vector, new_name::N; node_fn::Function = _default_mean, - edge_fn::Function = _default_mean -) where {N <: Any} + edge_fn::Function = _default_mean, + save_agg_edge_data::Bool = false, + agg_edge_fn::Function = _default_mean, + agg_edge_val::R = 0., + node_attributes_to_add::Vector{String} = String[] +) where {N <: Any, R <: Any} nodes = dg.nodes node_map = dg.node_map node_data = dg.node_data.data @@ -849,6 +863,26 @@ function aggregate( error("New node name already exists in set of non-aggregated nodes") end + if save_agg_edge_data + if length(dg.edge_data.attributes) > 0 && length(node_attributes_to_add) > 0 + if length(dg.edge_data.attributes) != length(node_attributes_to_add) + error("Length of the node_attributes_to_add does not match the edge_data attributes") + end + for i in 1:length(node_attributes_to_add) + if node_attributes_to_add[i] in node_attributes + error("Attribute name $(node_attributes_to_add[i]) is already defined in node_attributes") + end + end + elseif length(dg.edge_data.attributes) > 0 + attribute_names = dg.edge_data.attributes + for i in 1:length(attribute_names) + if attribute_names[i] in node_attributes + error("Edge data attribute names conflict with node data attributes; user must pass node_attributes_to_add") + end + end + end + end + T = eltype(dg) T1 = eltype(get_node_data(dg)) M1 = typeof(get_node_data(dg)) @@ -903,6 +937,8 @@ function aggregate( edge_bool_avg_index = Dict{Tuple{T, T}, Vector{T}}() new_edge_data = fill(0, (0, length(edge_attributes))) + removed_edge_bool_vec = [false for i in 1:length(edges)] + for i in 1:length(nodes) node_name_mapping[node_map[nodes[i]]] = nodes[i] end @@ -989,6 +1025,10 @@ function aggregate( end end end + else + if save_agg_edge_data + removed_edge_bool_vec[i] = true + end end end @@ -1005,6 +1045,44 @@ function aggregate( 
new_dg.edge_data.attributes = copy(edge_attributes) new_dg.edge_data.attribute_map = copy(edge_attribute_map) new_dg.edge_data.data = copy(new_edge_data) + + if save_agg_edge_data + new_node_data = new_dg.node_data.data + new_node_attributes = new_dg.node_data.attributes + new_node_attribute_map = new_dg.node_data.attribute_map + if length(node_attributes) > 0 + edge_data_to_avg = edge_data[removed_edge_bool_vec, :] + if length(node_attributes_to_add) > 0 + attributes_to_add = node_attributes_to_add + else + attributes_to_add = edge_attributes + end + for j in 1:length(attributes_to_add) + push!(new_node_attributes, attributes_to_add[j]) + new_node_attribute_map[attributes_to_add[j]] = length(new_node_attributes) + end + data_to_add = fill(agg_edge_val, (length(new_nodes), length(edge_attributes))) + data_to_add[length(new_nodes), :] .= agg_edge_fn(edge_data_to_avg) + old_data = new_node_data + new_dg.node_data.data = hcat(old_data, data_to_add) + else + edge_data_to_avg = edge_data[removed_edge_bool_vec, :] + if length(node_attributes_to_add) > 0 + attributes_to_add = node_attributes_to_add + else + attributes_to_add = edge_attributes + end + new_dg.node_data.attributes = attributes_to_add + new_node_attribute_map = new_dg.node_data.attribute_map + for j in 1:length(attributes_to_add) + new_node_attribute_map[attributes_to_add[j]] = j + end + data_to_add = fill(agg_edge_val, (length(new_nodes), length(edge_attributes))) + data_to_add[length(new_nodes), :] .= agg_edge_fn(edge_data_to_avg) + old_data = zeros(T1, (length(new_nodes), 0)) + new_dg.node_data.data = hcat(old_data, data_to_add) + end + end end simple_graph = Graphs.SimpleGraph(T(length(new_edges)), fadjlist) diff --git a/test/DataDiGraph_utils_test.jl b/test/DataDiGraph_utils_test.jl index bb85f34..db5e5e7 100644 --- a/test/DataDiGraph_utils_test.jl +++ b/test/DataDiGraph_utils_test.jl @@ -90,3 +90,28 @@ remove_edge!(dg, (1, 3)) @test_throws ErrorException remove_edge!(dg, 1, 2) @test_throws 
ErrorException remove_edge!(dg, 1, 6) end + +dg = DataDiGraph() +for i in 1:length(nodes) + add_node!(dg, nodes[i]) +end + +for i in 1:length(edges) + PlasmoData.add_edge!(dg, edges[i]) + add_edge_data!(dg, edges[i], edge_data[i]) +end + +agg_graph = aggregate(dg, [1, 5], "new_node", save_agg_edge_data = true, node_attributes_to_add = ["weight2"]) +agg_graph2 = aggregate(dg, [1, 5], "new_node", save_agg_edge_data = true) + +@testset "aggregate test 2" begin + @test agg_graph.node_data.attributes == ["weight2"] + @test length(agg_graph.node_data.attribute_map) == 1 + @test get_node_data(agg_graph, "new_node", "weight2") == 2 + @test get_node_data(agg_graph, 2, "weight2") == 0 + + @test agg_graph2.node_data.attributes == ["weight"] + @test length(agg_graph2.node_data.attribute_map) == 1 + @test get_node_data(agg_graph2, "new_node", "weight") == 2 + @test get_node_data(agg_graph2, 2, "weight") == 0 +end diff --git a/test/DataGraph_utils_test.jl b/test/DataGraph_utils_test.jl index 39cfb00..1e17334 100644 --- a/test/DataGraph_utils_test.jl +++ b/test/DataGraph_utils_test.jl @@ -169,4 +169,32 @@ agg_graph = aggregate(dg, [(2, 2), (2, 3)], "agg_node") @test test_edge_exists(agg_graph, (2, 1), "agg_node") @test test_edge_exists(agg_graph, (3, 2), "agg_node") @test test_edge_exists(agg_graph, (3, 3), "agg_node") + @test_throws ErrorException aggregate(dg, ["a", "b"], "agg_node") +end + + +dg = matrix_to_graph(matrix, diagonal = false) + +edge_vals = fill(1., length(dg.edges)) + +add_edge_dataset!(dg, edge_vals, "edge_weight") + +agg_graph = aggregate(dg, [(2, 2), (3, 2)], "agg_node", save_agg_edge_data = true) +agg_graph2 = aggregate(dg, [(2, 2), (3, 2)], "agg_node", save_agg_edge_data = true, node_attributes_to_add = ["weight2"]) + +dg_extra = matrix_to_graph(matrix, diagonal = false) +add_edge_dataset!(dg_extra, edge_vals, "weight") + +@testset "aggregate test 2" begin + @test agg_graph.node_data.attributes == ["weight", "edge_weight"] + @test 
length(agg_graph.node_data.attribute_map) == 2 + @test get_node_data(agg_graph, "agg_node", "edge_weight") == 1 + @test get_node_data(agg_graph, (1, 1), "edge_weight") == 0 + @test agg_graph2.node_data.attributes == ["weight", "weight2"] + @test length(agg_graph2.node_data.attribute_map) == 2 + @test get_node_data(agg_graph2, "agg_node", "weight2") == 1 + @test get_node_data(agg_graph2, (1, 1), "weight2") == 0 + @test_throws ErrorException aggregate(dg, [(2,2), (3,2)], "agg_node", save_agg_edge_data = true, node_attributes_to_add = ["edge_weight", "edge_weight2"]) + @test_throws ErrorException aggregate(dg, [(2,2), (3,2)], "agg_node", save_agg_edge_data = true, node_attributes_to_add = ["weight"]) + @test_throws ErrorException aggregate(dg_extra, [(2,2), (3,2)], "agg_node", save_agg_edge_data = true) end