From 2feda7a96cf7c601736d5e92287d68a1ea4b526d Mon Sep 17 00:00:00 2001 From: Kelechi Ukah Date: Sun, 20 Oct 2024 05:08:14 -0700 Subject: [PATCH] optimal transport formulation. wip --- src/clustering/abstractor.rs | 11 +- src/clustering/histogram.rs | 15 +- src/clustering/layer.rs | 97 ++++++------ src/clustering/metric.rs | 276 ++++++++++++++++++++++++----------- src/main.rs | 2 +- 5 files changed, 260 insertions(+), 141 deletions(-) diff --git a/src/clustering/abstractor.rs b/src/clustering/abstractor.rs index 667520c..485304d 100644 --- a/src/clustering/abstractor.rs +++ b/src/clustering/abstractor.rs @@ -38,6 +38,8 @@ impl Abstractor { .inner() // cluster turn .save() .inner() // cluster flop + .save() + .inner() // cluster preflop (but really just save flop.metric) .save(); } } @@ -236,9 +238,10 @@ impl Abstractor { /// 3. Write the extension (4 bytes) /// 4. Write the observation and abstraction pairs /// 5. Write the trailer (2 bytes) - pub fn save(&self, name: String) { - log::info!("saving abstraction lookup {}", name); - let ref mut file = File::create(format!("{}.abstraction.pgcopy", name)).expect("new file"); + pub fn save(&self, street: Street) { + log::info!("{:<32}{:<32}", "saving abstraction lookup", street); + let ref mut file = + File::create(format!("{}.abstraction.pgcopy", street)).expect("new file"); file.write_all(b"PGCOPY\n\xff\r\n\0").expect("header"); file.write_u32::(0).expect("flags"); file.write_u32::(0).expect("extension"); @@ -272,7 +275,7 @@ mod tests { .map(|o| (o, Abstraction::random())) .collect(), ); - save.save(street.to_string()); + save.save(street); // Load from disk let load = Abstractor::load_street(street); std::iter::empty() diff --git a/src/clustering/histogram.rs b/src/clustering/histogram.rs index e1a5f20..b89a7b9 100644 --- a/src/clustering/histogram.rs +++ b/src/clustering/histogram.rs @@ -29,8 +29,14 @@ impl Histogram { /// all witnessed Abstractions. /// treat this like an unordered array /// even though we use BTreeMap for struct. - pub fn support(&self) -> Vec<&Abstraction> { - self.contribution.keys().collect() + pub fn support(&self) -> impl Iterator { + self.contribution.keys() + } + pub fn normalized(&self) -> BTreeMap { + self.contribution + .iter() + .map(|(&a, &count)| (a, count as f32 / self.mass as f32)) + .collect() } /// useful only for k-means edge case of centroid drift @@ -39,6 +45,11 @@ impl Histogram { self.contribution.is_empty() } + /// size of the support + pub fn size(&self) -> usize { + self.contribution.len() + } + /// insert the Abstraction into our support, /// incrementing its local weight, /// incrementing our global norm. diff --git a/src/clustering/layer.rs b/src/clustering/layer.rs index 41f7390..e80964e 100644 --- a/src/clustering/layer.rs +++ b/src/clustering/layer.rs @@ -17,21 +17,6 @@ use rayon::iter::IntoParallelRefIterator; use rayon::iter::ParallelIterator; use std::collections::BTreeMap; -/// number of kmeans centroids. -/// this determines the granularity of the abstraction space. -/// -/// - CPU: O(N^2) for kmeans initialization -/// - CPU: O(N) for kmeans clustering -/// - RAM: O(N^2) for learned metric -/// - RAM: O(N) for learned centroids -const N_KMEANS_CENTROIDS: usize = 256; - -/// number of kmeans iterations. -/// this controls the precision of the abstraction space. -/// -/// - CPU: O(N) for kmeans clustering -const N_KMEANS_ITERATION: usize = 64; - /// Hierarchical K Means Learner. /// this is decomposed into the necessary data structures /// for kmeans clustering to occur for a given `Street`. @@ -61,6 +46,35 @@ pub struct Layer { } impl Layer { + /// number of kmeans centroids. + /// this determines the granularity of the abstraction space. + /// + /// - CPU: O(N^2) for kmeans initialization + /// - CPU: O(N) for kmeans clustering + /// - RAM: O(N^2) for learned metric + /// - RAM: O(N) for learned centroids + const fn k(street: Street) -> usize { + match street { + Street::Pref => 169, + Street::Flop => 8, + Street::Turn => 8, + Street::Rive => unreachable!(), + } + } + + /// number of kmeans iterations. + /// this controls the precision of the abstraction space. + /// + /// - CPU: O(N) for kmeans clustering + const fn t(street: Street) -> usize { + match street { + Street::Pref => 0, + Street::Flop => 128, + Street::Turn => 32, + Street::Rive => unreachable!(), + } + } + /// start with the River layer. everything is empty because we /// can generate `Abstractor` and `SmallSpace` from "scratch". /// - `lookup`: lazy equity calculation of river observations @@ -95,8 +109,8 @@ impl Layer { } /// save the current layer's `Metric` and `Abstractor` to disk pub fn save(self) -> Self { - self.metric.save(format!("{}", self.street.next())); // outer layer generates this purely (metric over projections) - self.lookup.save(format!("{}", self.street)); // while inner layer generates this (clusters) + self.metric.save(self.street.next()); // outer layer generates this purely (metric over projections) + self.lookup.save(self.street); // while inner layer generates this (clusters) self } @@ -115,7 +129,7 @@ impl Layer { /// /// we symmetrize the distance by averaging the EMDs in both directions. /// the distnace isn't symmetric in the first place only because our heuristic algo is not fully accurate - pub fn inner_metric(&self) -> Metric { + fn inner_metric(&self) -> Metric { log::info!( "{:<32}{:<32}", "computing metric", @@ -170,13 +184,13 @@ impl Layer { log::info!( "{:<32}{:<32}", "declaring abstractions", - format!("{} {} clusters", self.street, N_KMEANS_CENTROIDS) + format!("{} {} clusters", self.street, Self::k(self.street)) ); let ref mut rng = rand::thread_rng(); - let progress = Self::progress(N_KMEANS_CENTROIDS); + let progress = Self::progress(Self::k(self.street)); self.kmeans.expand(self.sample_uniform(rng)); progress.inc(1); - while self.kmeans.0.len() < N_KMEANS_CENTROIDS { + while self.kmeans.0.len() < Self::k(self.street) { self.kmeans.expand(self.sample_outlier(rng)); progress.inc(1); } @@ -189,17 +203,16 @@ impl Layer { log::info!( "{:<32}{:<32}", "clustering observations", - format!("{} {} iterations", self.street, N_KMEANS_ITERATION) + format!("{} {} iterations", self.street, Self::t(self.street)) ); - let progress = Self::progress(N_KMEANS_ITERATION); - for _ in 0..N_KMEANS_ITERATION { + let progress = Self::progress(Self::t(self.street)); + for _ in 0..Self::t(self.street) { let neighbors = self .points .0 .par_iter() .map(|(_, h)| self.nearest_neighbor(h)) .collect::>(); - self.kmeans.clear(); self.assign_nearest_neighbor(neighbors); self.assign_orphans_randomly(); progress.inc(1); @@ -211,36 +224,33 @@ impl Layer { /// by computing the EMD distance between the `Observation`'s `Histogram` and each `Centroid`'s `Histogram` /// and returning the `Abstraction` of the nearest `Centroid` fn assign_nearest_neighbor(&mut self, neighbors: Vec<(Abstraction, f32)>) { + self.kmeans.clear(); let mut loss = 0.; - for ((observation, histogram), (abstraction, distance)) in - std::iter::zip(self.points.0.iter_mut(), neighbors.iter()) - { - loss += distance * distance; - self.lookup.assign(abstraction, observation); - self.kmeans.absorb(abstraction, histogram); + for ((obs, hist), (abs, dist)) in self.points.0.iter_mut().zip(neighbors.iter()) { + loss += dist * dist; + self.lookup.assign(abs, obs); + self.kmeans.absorb(abs, hist); } - log::debug!("LOSS {:>12.8}", loss / self.points.0.len() as f32); + let loss = loss / self.points.0.len() as f32; + log::trace!("LOSS {:>12.8}", loss); } /// centroid drift may make it such that some centroids are empty /// so we reinitialize empty centroids with random Observations if necessary fn assign_orphans_randomly(&mut self) { for ref a in self.kmeans.orphans() { - log::warn!( - "{:<32}{:<32}", - "reassigning empty centroid", - format!("0x{}", a) - ); let ref mut rng = rand::thread_rng(); let ref sample = self.sample_uniform(rng); self.kmeans.absorb(a, sample); + log::debug!( + "{:<32}{:<32}", + "reassigned empty centroid", + format!("0x{}", a) + ); } } /// the first Centroid is uniformly random across all `Observation` `Histogram`s - fn sample_uniform(&self, rng: &mut R) -> Histogram - where - R: Rng, - { + fn sample_uniform(&self, rng: &mut R) -> Histogram { self.points .0 .values() @@ -251,10 +261,7 @@ impl Layer { /// each next Centroid is selected with probability proportional to /// the squared distance to the nearest neighboring Centroid. /// faster convergence, i guess. on the shoulders of giants - fn sample_outlier(&self, rng: &mut R) -> Histogram - where - R: Rng, - { + fn sample_outlier(&self, rng: &mut R) -> Histogram { let weights = self .points .0 diff --git a/src/clustering/metric.rs b/src/clustering/metric.rs index 01c4dad..97ccb39 100644 --- a/src/clustering/metric.rs +++ b/src/clustering/metric.rs @@ -1,8 +1,71 @@ +#![allow(unused)] + +use crate::cards::street::Street; use crate::clustering::abstraction::Abstraction; use crate::clustering::histogram::Histogram; use crate::clustering::xor::Pair; use std::collections::BTreeMap; +struct River; +impl Transport for River { + type M = Metric; + type X = Abstraction; // ::Equity(i8) + type Y = Abstraction; // ::Equity(i8) + type P = Histogram; + type Q = Histogram; + fn flow(&self, x: &Self::X, y: &Self::Y) -> f32 { + unreachable!() + } + fn cost(&self, x: &Self::P, y: &Self::Q, _: &Self::M) -> f32 { + // Self::variation(x, y) + // Self::euclidean(x, y) + // Self::chisq(x, y) + Self::variation(x, y) + } +} +/// different distance metrics over Equity Histograms +/// conveniently have properties of distributions over the [0, 1] interval. +impl River { + fn variation(x: &Histogram, y: &Histogram) -> f32 { + // assert!(matches!(x.peek(), Abstraction::Equity(_))); + // assert!(matches!(y.peek(), Abstraction::Equity(_))); + let mut total = 0.; + let mut cdf_x = 0.; + let mut cdf_y = 0.; + for abstraction in Abstraction::range() { + cdf_x += x.weight(abstraction); + cdf_y += y.weight(abstraction); + total += (cdf_x - cdf_y).abs(); + } + total / 2. + } + #[allow(unused)] + fn euclidean(x: &Histogram, y: &Histogram) -> f32 { + // assert!(matches!(x.peek(), Abstraction::Equity(_))); + // assert!(matches!(y.peek(), Abstraction::Equity(_))); + let mut total = 0.; + for abstraction in Abstraction::range() { + let x_density = x.weight(abstraction); + let y_density = y.weight(abstraction); + total += (x_density - y_density).powi(2); + } + total.sqrt() + } + #[allow(unused)] + fn chisq(x: &Histogram, y: &Histogram) -> f32 { + // assert!(matches!(x.peek(), Abstraction::Equity(_))); + // assert!(matches!(y.peek(), Abstraction::Equity(_))); + let mut total = 0.; + for abstraction in Abstraction::range() { + let x_density = x.weight(abstraction); + let y_density = y.weight(abstraction); + let delta = x_density - y_density; + total += delta * delta / (x_density + y_density); + } + total + } +} + /// Distance metric for kmeans clustering. /// encapsulates distance between `Abstraction`s of the "previous" hierarchy, /// as well as: distance between `Histogram`s of the "current" hierarchy. @@ -10,115 +73,78 @@ use std::collections::BTreeMap; pub struct Metric(pub BTreeMap); impl Metric { - /// This function *approximates* the Earth Mover's Distance (EMD) between two histograms. - /// EMD is a measure of the distance between two probability distributions. - /// It is calculated by finding the minimum amount of "work" required to transform - /// one distribution into the other. - /// - /// for Histogram where T: Ord, we can efficiently and accurately calculate EMD - /// 1. sort elements of the support (already done for free because BTreeMap) - /// 2. produce CDF of source and target distributions - /// 3. integrate difference between CDFs over support - /// - /// we only have the luxury of this efficient O(N) calculation on Street::Turn, - /// where the support is over the Abstraction::Equity(i8) variant. pub fn emd(&self, source: &Histogram, target: &Histogram) -> f32 { match source.peek() { - Abstraction::Equity(_) => Self::difference(source, target), - Abstraction::Random(_) => self.wasserstein(source, target), - Abstraction::Pocket(_) => unreachable!("no preflop emd"), + Abstraction::Equity(_) => River::variation(source, target), + Abstraction::Random(_) => self.greedy(source, target), + Abstraction::Pocket(_) => 1., // unreachable!("no preflop emd"), } } - /// generated recursively and hierarchically - /// we can calculate the distance between two abstractions - /// by eagerly finding distance between their centroids - fn distance(&self, x: &Abstraction, y: &Abstraction) -> f32 { + fn lookup(&self, x: &Abstraction, y: &Abstraction) -> f32 { if x == y { 0. } else { - match (x, y) { - (Abstraction::Random(_), Abstraction::Random(_)) => self - .0 - .get(&Pair::from((x, y))) - .copied() - .expect("precalculated distance"), - _ => unreachable!("no distance lookup for equity or pocket abstractions"), - } + *self.0.get(&Pair::from((x, y))).unwrap() } } - /// here we have the luxury of calculating EMD - /// over 1-dimensional support of Abstraction::Equity - /// so we just integrate the absolute difference between CDFs - fn difference(x: &Histogram, y: &Histogram) -> f32 { - let mut total = 0.; - let mut cdf_x = 0.; - let mut cdf_y = 0.; - for abstraction in Abstraction::range() { - cdf_x += x.weight(abstraction); - cdf_y += y.weight(abstraction); - total += (cdf_x - cdf_y).abs(); - } - total + fn image(&self, x: &Histogram) -> BTreeMap { + x.support() + .into_iter() + .map(|&a| (a, 1. / x.size() as f32)) + .collect() } - /// Beware the asymmetry: - /// EMD(X,Y) != EMD(Y,X) - /// Centroid should be the "hole" (sink) in the EMD calculation - fn wasserstein(&self, source: &Histogram, target: &Histogram) -> f32 { - let x = source.support(); - let y = target.support(); - let mut energy = 0.; - let mut hasmoved = x - .iter() - .map(|&a| (a, false)) - .collect::>(); - let mut notmoved = x - .iter() - .map(|&a| (a, 1.0 / x.len() as f32)) - .collect::>(); - let mut unfilled = y - .iter() - .map(|&a| (a, target.weight(a))) - .collect::>(); // this is effectively a clone - for _ in 0..y.len() { - for pile in x.iter() { - // skip if we have already moved all the earth from this source - if *hasmoved.get(pile).expect("in x domain") { - continue; - } - // find the nearest neighbor of X (source) from Y (sink) - let (hole, distance) = y + fn range(&self, y: &Histogram) -> BTreeMap { + y.support() + .into_iter() + .map(|&a| (a, y.weight(&a))) + .collect() + } + + fn greedy(&self, x: &Histogram, y: &Histogram) -> f32 { + let mut cost = 0.; + let mut sources = self.image(x); + let mut targets = self.range(y); + 'cost: while sources.values().any(|&v| v > 0.) { + 'flow: for (x, pile) in sources + .iter() + .cycle() + .map(|(&x, &pile)| (x, pile)) + .collect::>() + { + let nearest = targets .iter() - .map(|sink| (*sink, self.distance(pile, sink))) - .min_by(|(_, a), (_, b)| a.partial_cmp(b).expect("not NaN")) - .expect("y domain not empty"); - // decide if we can remove earth from both distributions - let demand = *notmoved.get(pile).expect("in x domain"); - let vacant = *unfilled.get(hole).expect("in y domain"); - if vacant > 0. { - energy += distance * demand.min(vacant); - } else { - continue; - } - // remove earth from both distributions - if demand > vacant { - *notmoved.get_mut(pile).expect("in x domain") -= vacant; - *unfilled.get_mut(hole).expect("in y domain") = 0.; - } else { - *unfilled.get_mut(hole).expect("in y domain") -= demand; - *notmoved.get_mut(pile).expect("in x domain") = 0.; - *hasmoved.get_mut(pile).expect("in x domain") = true; + .filter(|(_, &hole)| hole > 0.) + .map(|(&y, &hole)| ((y, hole), self.distance(&x, &y))) + .min_by(|(_, y1), (_, y2)| y1.partial_cmp(y2).unwrap()); + match nearest { + None => break 'cost, + Some(((y, hole), distance)) => { + if pile > hole { + let earth = hole; + cost += distance * earth; + *sources.get_mut(&x).unwrap() -= earth; + *targets.get_mut(&y).unwrap() = 0.; + continue 'flow; + } else { + let earth = pile; + cost += distance * earth; + *sources.get_mut(&x).unwrap() = 0.; + *targets.get_mut(&y).unwrap() -= earth; + continue 'flow; + } + } } } } - energy + cost } /// save profile to disk in a PGCOPY compatible format - pub fn save(&self, street: String) { - log::info!("uploading abstraction metric {}", street); + pub fn save(&self, street: Street) { + log::info!("{:<32}{:<32}", "uploading abstraction metric", street); use byteorder::BigEndian; use byteorder::WriteBytesExt; use std::fs::File; @@ -138,6 +164,78 @@ impl Metric { } } +trait Support {} +trait Density { + type X: Support; + fn density(&self, x: &Self::X) -> f32; + fn support(&self) -> impl Iterator; +} +trait Measure { + type X: Support; + type Y: Support; + fn distance(&self, x: &Self::X, y: &Self::Y) -> f32; +} +trait Transport { + type X: Support; + type Y: Support; + type P: Density; + type Q: Density; + type M: Measure; + fn flow(&self, x: &Self::X, y: &Self::Y) -> f32; + fn cost(&self, p: &Self::P, q: &Self::Q, m: &Self::M) -> f32 { + let mut cost = 0.; + for x in p.support() { + for y in q.support() { + let dx = p.density(x); + let dy = q.density(y); + let area = m.distance(x, y); + let flux = self.flow(x, y); + cost += area * flux * dx * dy; + } + } + cost + } +} + +impl Support for f32 {} +impl Support for Abstraction {} +impl Density for Histogram { + type X = Abstraction; + fn density(&self, x: &Self::X) -> f32 { + self.weight(x) + } + fn support(&self) -> impl Iterator { + self.support().into_iter() + } +} +impl Measure for Metric { + type X = Abstraction; + type Y = Abstraction; + fn distance(&self, x: &Self::X, y: &Self::Y) -> f32 { + match (x, y) { + (Self::X::Random(_), Self::Y::Random(_)) => self.lookup(x, y), + (Self::X::Equity(a), Self::Y::Equity(b)) => (a - b).abs() as f32, + _ => unreachable!("only equity distance for river abstractions"), + } + } +} + +impl Transport for Metric { + type M = Self; + type X = Abstraction; + type Y = Abstraction; + type P = Histogram; + type Q = Histogram; + fn flow(&self, x: &Self::X, y: &Self::Y) -> f32 { + let _ = y; + let _ = x; + todo!("if we want to we can explicitly use this to calculate P_ij. like if we use a LP solver to get an exact solution, this would yield matrix elements of the optimal P_ij transport plan. alternatively, we can update some internal state while we iterate through self.cost(), but that'd require another method that does some initalization so that it can mutate itself and allow cost + flow to become lookups. effectively, eagerly computing EMD.") + } + fn cost(&self, x: &Self::P, y: &Self::Q, _: &Self::M) -> f32 { + self.greedy(x, y) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/src/main.rs b/src/main.rs index b5616be..dfbcaaf 100644 --- a/src/main.rs +++ b/src/main.rs @@ -69,7 +69,7 @@ fn logging() { .expect("time moves slow") .as_secs(); let file = simplelog::WriteLogger::new( - log::LevelFilter::Debug, + log::LevelFilter::Trace, config.clone(), std::fs::File::create(format!("logs/{}.log", time)).expect("create log file"), );