update a note

hxf0223 · Aug 26, 2024 · cc93eaa · cc93eaa
1 parent 5432631
commit cc93eaa
Show file tree

Hide file tree

Showing 2 changed files with 210 additions and 17 deletions.
diff --git a/_posts/2024-08-10-algrithm-DBSCAN.md b/_posts/2024-08-10-algrithm-DBSCAN.md
diff --git a/_posts/2024-08-26-density-algrithm-knn.md b/_posts/2024-08-26-density-algrithm-knn.md
@@ -0,0 +1,210 @@
+---
+title: 聚类算法（密度）：基于 nano-flann
+date: 2024-08-26 +0800 # 2022-01-01 13:14:15 +0800 只写日期也行；不写秒也行；这样也行 2022-03-09T00:55:42+08:00
+categories: [算法]
+tags: [算法, c++]      # TAG names should always be lowercase
+
+# 以下默认false
+math: true
+mermaid: true
+# pin: true
+---
+
+## 1. 类点云数据聚类算法
+
+采集到的二维点云数据(samples)，生成`K-D`搜索树，使用广度优先搜索，聚合成`block`数据。后续的识别/分类算法，在`block`数据基础上进行。
+
+由于使用点云处理库`PCL`比较庞大，以及其依的`FLANN`基于`C++14`，使用`C++17/20`导致在自定义点云数据结构时，编译有些`STL`算法库被废弃，编译出错。故使用 [nanoflann](https://github.com/jlblancoc/nanoflann)。
+
+### 1.1 自定义点云数据结构
+
+```c++
+#pragma once
+// test\test_flann\flann_adaptor.h
+
+#include <vector>
+
+#include "3rd_utils.h"
+#include "typedef.h"
+
+struct PointCloud {
+  using Point = rias::data_type::sample_data_t;
+  using coord_t = double;  //!< The type of each coordinate
+
+  std::vector<Point> pts;
+
+  // Must return the number of data points
+  inline size_t kdtree_get_point_count() const { return pts.size(); }
+
+  // Returns the dim'th component of the idx'th point in the class:
+  // Since this is inlined and the "dim" argument is typically an immediate
+  // value, the
+  //  "if/else's" are actually solved at compile time.
+  inline double kdtree_get_pt(const size_t idx, const size_t dim) const {
+    CHECK2(dim < 2, "Invalid dimension: " << dim);
+    if (dim == 0) return pts[idx].mapped_pos_.x_;
+    else return pts[idx].mapped_pos_.y_;
+  }
+
+  // Optional bounding-box computation: return false to default to a standard
+  // bbox computation loop.
+  //   Return true if the BBOX was already computed by the class and returned
+  //   in "bb" so it can be avoided to redo it again. Look at bb.size() to
+  //   find out the expected dimensionality (e.g. 2 or 3 for point clouds)
+  template <class BBOX>
+  bool kdtree_get_bbox(BBOX& /* bb */) const {
+    return false;
+  }
+};
+```
+
+### 1.2 基于 K-D 树构建广度优先搜索算法
+
+```c++
+#pragma once
+// test\test_flann\point_cloud_algorithm.h
+
+#include <queue>
+#include <vector>
+
+#pragma warning(push)
+#pragma warning(disable : 4267)  // disable conversion warning
+#include "nanoflann/nanoflann.hpp"
+#pragma warning(pop)
+
+#include "3rd_utils.h"
+
+#include "flann_adaptor.h"
+#include "typedef.h"
+
+namespace rias::alg {
+
+class BFSDensitySampleSearch {
+  static constexpr int32_t kDIM = 2;
+  using my_kd_tree_t = nanoflann::KDTreeSingleIndexAdaptor<nanoflann::L2_Simple_Adaptor<double, PointCloud>, PointCloud, kDIM>;
+  using sample_t = rias::data_type::sample_data_t;
+
+ public:
+  BFSDensitySampleSearch(const std::vector<sample_t>& pts) : pts_(pts) {}
+  ~BFSDensitySampleSearch() = default;
+
+  BFSDensitySampleSearch& setRadius(double radius) {
+    radius_ = radius;
+    return *this;
+  }
+
+  BFSDensitySampleSearch& setMinPts(double minPts) {
+    minPts_ = minPts;
+    return *this;
+  }
+
+  const std::vector<std::vector<size_t>>& clusters() const { return clusters_; }
+
+  BFSDensitySampleSearch& search() {
+    PointCloud cloud;
+    cloud.pts = pts_;
+
+    CHECK1(sizeof(double[kDIM]) == sizeof(data_type::point_double_t), "sizeof(double[kDIM])");
+    my_kd_tree_t index(kDIM /*dim*/, cloud, {10 /* max leaf */});
+    std::vector<double> densities(pts_.size(), double{});
+
+    for (size_t i = 0; i < pts_.size(); i++) {
+      const auto* q_pt = (double*)(&pts_[i].mapped_pos_);
+      std::vector<nanoflann::ResultItem<uint32_t, double>> ret_matches;
+      const size_t n_matched = index.radiusSearch(q_pt, radius_, ret_matches);
+      densities[i] = static_cast<double>(n_matched);
+    }
+
+    std::vector<bool> visited(pts_.size(), false);
+    for (size_t i = 0; i < pts_.size(); i++) {
+      if (visited[i] || densities[i] < minPts_) {  // 0. skip visited or low density points
+        continue;
+      }
+
+      std::vector<size_t> cluster;
+      std::queue<size_t> q;
+
+      q.push(i);  // 1. add self ot queue at first
+      visited[i] = true;
+
+      while (!q.empty()) {
+        const auto idx = q.front();
+        q.pop();
+
+        cluster.push_back(idx);  // 2. elements in the same queue are in the same cluster
+
+        const auto* q_pt = (double*)(&pts_[idx].mapped_pos_);
+        std::vector<nanoflann::ResultItem<uint32_t, double>> ret_matches;
+        const size_t n_matched = index.radiusSearch(q_pt, radius_, ret_matches);
+
+        for (size_t j = 0; j < n_matched; j++) {  // 3. add matched points to the same queue
+          const auto n_idx = ret_matches[j].first;
+          // const auto n_dist = ret_matches[j].second;
+          if (!visited[n_idx] && densities[n_idx] >= minPts_) {
+            visited[n_idx] = true;
+            q.push(n_idx);
+          }
+        }
+      }
+
+      // 4. a cluster done (one or more points)
+      clusters_.push_back(cluster);
+    }
+
+    return *this;
+  }
+
+ private:
+  const std::vector<sample_t>& pts_;
+  double radius_{1.0}, minPts_{1.0};
+
+  std::vector<std::vector<size_t>> clusters_;
+};
+
+}  // namespace rias::alg
+```
+
+### 1.3 测试代码
+
+```c++
+{
+    const auto t0 = std::chrono::high_resolution_clock::now();
+
+    auto dss = BFSDensitySampleSearch(sample_ds->ds_).setRadius(3.0).setMinPts(1.0);
+    auto clusters = dss.search().clusters();
+
+    const auto t1 = std::chrono::high_resolution_clock::now();
+    const auto dur = std::chrono::duration<double, std::milli>(t1 - t0).count();
+    spdlog::info("search in {} points. found {} clusters. time: {:.3f} seconds\n", sample_ds->ds_.size(), clusters.size(),
+                 dur / 1000.0);
+
+    /*for (const auto& cluster : clusters) {
+      //spdlog::info("{}", fmt::join(cluster, ", "));
+      spdlog::info("cluster: {}", cluster);
+    }*/
+  }
+```
+
+### 1.4 测试
+
+```bash
+[2024-08-26 21:47:35.648] [warning] [dataxy_loader.cc:146] under flow 870 samples in dataset
+[2024-08-26 21:47:35.649] [info] load data from file: 0.37393689999999996 seconds
+
+[2024-08-26 21:47:37.740] [info] search in 1503894 points. found 438931 clusters. time: 2.090 seconds
+```
+
+TODO: 测试小数据集下的性能对比。
+
+### 1.5 参考
+
+- [nanoflann](https://github.com/jlblancoc/nanoflann)
+- [四叉树、八叉树、BSP树、K-D树](https://www.cnblogs.com/KillerAery/p/10878367.html)
+- [数据结构-k-d树](https://yanglei253.github.io/2020/07/11/dataStructure/dataStructure-kdtree/)
+- [nanoflann库使用笔记](https://zxl19.github.io/nanoflann-note/)
+
+## 聚类算法资料收集
+
+- [几种常用的基于密度的聚类算法](https://www.cnblogs.com/alterwl/p/density-based-clustering.html)
+- [DBSCAN密度聚类算法](https://www.cnblogs.com/pinard/p/6208966.html)
+- [K紧邻法(KNN)原理小结](https://www.cnblogs.com/pinard/p/6061661.html)