Skip to content

Commit

Permalink
test routing
Browse files Browse the repository at this point in the history
  • Loading branch information
PikaCat-OuO committed Jan 10, 2025
1 parent b9bbadb commit af885fe
Show file tree
Hide file tree
Showing 10 changed files with 168 additions and 266 deletions.
13 changes: 3 additions & 10 deletions src/evaluate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,7 @@ Value Eval::evaluate(const Eval::NNUE::Network& network,

assert(!pos.checkers());

auto [psqt, positional] = network.evaluate(pos, &caches.cache);
Value nnue = psqt + positional;
int nnueComplexity = std::abs(psqt - positional);

// Blend optimism and eval with nnue complexity
optimism += optimism * nnueComplexity / 485;
nnue -= nnue * nnueComplexity / 11683;
Value nnue = network.evaluate(pos, &caches.cache);

int mm = pos.major_material() / 40;
int v = (nnue * (443 + mm) + optimism * (76 + mm)) / 503;
Expand Down Expand Up @@ -82,9 +76,8 @@ std::string Eval::trace(Position& pos, const Eval::NNUE::Network& network) {

ss << std::showpoint << std::showpos << std::fixed << std::setprecision(2) << std::setw(15);

auto [psqt, positional] = network.evaluate(pos, &caches->cache);
Value v = psqt + positional;
v = pos.side_to_move() == WHITE ? v : -v;
Value v = network.evaluate(pos, &caches->cache);
v = pos.side_to_move() == WHITE ? v : -v;
ss << "NNUE evaluation " << 0.01 * UCIEngine::to_cp(v, pos) << " (white side)\n";

v = evaluate(network, pos, *caches, VALUE_ZERO);
Expand Down
8 changes: 0 additions & 8 deletions src/nnue/features/half_ka_v2_hm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,6 @@ IndexType HalfKAv2_hm::make_attack_bucket(const Position& pos, Color c) {
return AttackBucket[pos.count<ROOK>(c)][pos.count<KNIGHT>(c)][pos.count<CANNON>(c)];
}

// Returns the layer-stack bucket for the current position, looked up from
// the precomputed LayerStackBuckets table by the rook and knight+cannon
// counts of the side to move and of the opponent.
IndexType HalfKAv2_hm::make_layer_stack_bucket(const Position& pos) {
    const Color stm = pos.side_to_move();
    const Color opp = ~stm;

    const int ourRooks          = pos.count<ROOK>(stm);
    const int theirRooks        = pos.count<ROOK>(opp);
    const int ourKnightCannon   = pos.count<KNIGHT>(stm) + pos.count<CANNON>(stm);
    const int theirKnightCannon = pos.count<KNIGHT>(opp) + pos.count<CANNON>(opp);

    return LayerStackBuckets[ourRooks][theirRooks][ourKnightCannon][theirKnightCannon];
}

// Index of a feature for a given king position and another piece on some square
template<Color Perspective>
inline IndexType HalfKAv2_hm::make_index(Square s, Piece pc, int bucket, bool mirror) {
Expand Down
28 changes: 0 additions & 28 deletions src/nnue/features/half_ka_v2_hm.h
Original file line number Diff line number Diff line change
Expand Up @@ -166,41 +166,13 @@ class HalfKAv2_hm {
return v;
}();

// LayerStack buckets: a 3x3x5x5 table mapping
// [our rooks][their rooks][our knights+cannons][their knights+cannons]
// to one of 16 output-head buckets, built once at compile time.
static constexpr auto LayerStackBuckets = [] {
    std::array<std::array<std::array<std::array<uint8_t, 5>, 5>, 3>, 3> v{};
    for (uint8_t us_rook = 0; us_rook <= 2; ++us_rook)
        for (uint8_t opp_rook = 0; opp_rook <= 2; ++opp_rook)
            for (uint8_t us_knight_cannon = 0; us_knight_cannon <= 4; ++us_knight_cannon)
                for (uint8_t opp_knight_cannon = 0; opp_knight_cannon <= 4; ++opp_knight_cannon)
                    v[us_rook][opp_rook][us_knight_cannon][opp_knight_cannon] = [&] {
                        // Equal rook counts: buckets 0-11, four per rook count,
                        // split by total knight+cannon material (>= 4 or not)
                        // and by whether the knight+cannon counts are equal.
                        if (us_rook == opp_rook)
                            return us_rook * 4
                                 + int(us_knight_cannon + opp_knight_cannon >= 4) * 2
                                 + int(us_knight_cannon == opp_knight_cannon);
                        // Rook imbalances get dedicated buckets 12-15.
                        else if (us_rook == 2 && opp_rook == 1)
                            return 12;
                        else if (us_rook == 1 && opp_rook == 2)
                            return 13;
                        else if (us_rook > 0 && opp_rook == 0)
                            return 14;
                        else if (us_rook == 0 && opp_rook > 0)
                            return 15;
                        // Unreachable: the branches above cover every pair of
                        // rook counts in 0..2. (Would wrap to 255 as uint8_t.)
                        return -1;
                    }();
    return v;
}();

// Maximum number of simultaneously active features.
static constexpr IndexType MaxActiveDimensions = 32;
using IndexList = ValueList<IndexType, MaxActiveDimensions>;

// Get attack bucket
static IndexType make_attack_bucket(const Position& pos, Color c);

// Get layer stack bucket
static IndexType make_layer_stack_bucket(const Position& pos);

// Index of a feature for a given king position and another piece on some square
template<Color Perspective>
static IndexType make_index(Square s, Piece pc, int bucket, bool mirror);
Expand Down
30 changes: 19 additions & 11 deletions src/nnue/network.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,9 @@ Network::Network(const Network& other) :
if (other.featureTransformer)
featureTransformer = make_unique_large_page<FeatureTransformer>(*other.featureTransformer);

if (other.router)
router = make_unique_aligned<RouterArchitecture>(*other.router);

network = make_unique_aligned<NetworkArchitecture[]>(LayerStacks);

if (!other.network)
Expand All @@ -75,6 +78,8 @@ Network& Network::operator=(const Network& other) {

featureTransformer = make_unique_large_page<FeatureTransformer>(*other.featureTransformer);

router = make_unique_aligned<RouterArchitecture>(*other.router);

network = make_unique_aligned<NetworkArchitecture[]>(LayerStacks);

if (!other.network)
Expand Down Expand Up @@ -135,7 +140,7 @@ bool Network::save(const std::optional<std::string>& filename) const {
}


NetworkOutput Network::evaluate(const Position& pos, AccumulatorCaches::Cache* cache) const {
Value Network::evaluate(const Position& pos, AccumulatorCaches::Cache* cache) const {
// We manually align the arrays on the stack because with gcc < 9.3
// overaligning stack variables with alignas() doesn't work correctly.

Expand All @@ -153,11 +158,11 @@ NetworkOutput Network::evaluate(const Position& pos, AccumulatorCaches::Cache* c

ASSERT_ALIGNED(transformedFeatures, alignment);

const int bucket = FeatureSet::make_layer_stack_bucket(pos);
const auto psqt = featureTransformer->transform(pos, cache, transformedFeatures, bucket);
const auto positional = network[bucket].propagate(transformedFeatures);
featureTransformer->transform(pos, cache, transformedFeatures);
const int bucket = router->propagate(transformedFeatures);
const auto nnue = network[bucket].propagate(transformedFeatures);

return {static_cast<Value>(psqt / OutputScale), static_cast<Value>(positional / OutputScale)};
return static_cast<Value>(nnue / OutputScale);
}


Expand Down Expand Up @@ -226,16 +231,14 @@ NnueEvalTrace Network::trace_evaluate(const Position& pos, AccumulatorCaches::Ca
ASSERT_ALIGNED(transformedFeatures, alignment);

NnueEvalTrace t{};
t.correctBucket = FeatureSet::make_layer_stack_bucket(pos);
for (IndexType bucket = 0; bucket < LayerStacks; ++bucket)
{
const auto materialist =
featureTransformer->transform(pos, cache, transformedFeatures, bucket);
const auto positional = network[bucket].propagate(transformedFeatures);
featureTransformer->transform(pos, cache, transformedFeatures);
const auto nnue = network[bucket].propagate(transformedFeatures);

t.psqt[bucket] = static_cast<Value>(materialist / OutputScale);
t.positional[bucket] = static_cast<Value>(positional / OutputScale);
t.nnue[bucket] = static_cast<Value>(nnue / OutputScale);
}
t.correctBucket = router->propagate(transformedFeatures);

return t;
}
Expand All @@ -255,6 +258,7 @@ void Network::load_user_net(const std::string& dir, const std::string& evalfileP

// Allocates fresh, default-constructed storage for every component of the
// network: the per-bucket evaluation layer stacks, the bucket-routing head
// and the input feature transformer. The three allocations are independent.
void Network::initialize() {
    network            = make_unique_aligned<NetworkArchitecture[]>(LayerStacks);
    router             = make_unique_aligned<RouterArchitecture>();
    featureTransformer = make_unique_large_page<FeatureTransformer>();
}

Expand Down Expand Up @@ -312,6 +316,8 @@ bool Network::read_parameters(std::istream& stream, std::string& netDescription)
return false;
if (!Detail::read_parameters(stream, *featureTransformer))
return false;
if (!Detail::read_parameters(stream, *router))
return false;
for (std::size_t i = 0; i < LayerStacks; ++i)
{
if (!Detail::read_parameters(stream, network[i]))
Expand All @@ -326,6 +332,8 @@ bool Network::write_parameters(std::ostream& stream, const std::string& netDescr
return false;
if (!Detail::write_parameters(stream, *featureTransformer))
return false;
if (!Detail::write_parameters(stream, *router))
return false;
for (std::size_t i = 0; i < LayerStacks; ++i)
{
if (!Detail::write_parameters(stream, network[i]))
Expand Down
7 changes: 4 additions & 3 deletions src/nnue/network.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,6 @@ class Position;

namespace Eval::NNUE {

using NetworkOutput = std::tuple<Value, Value>;

class Network {
public:
Network(EvalFile file) :
Expand All @@ -54,7 +52,7 @@ class Network {
void load(const std::string& rootDirectory, std::string evalfilePath);
bool save(const std::optional<std::string>& filename) const;

NetworkOutput evaluate(const Position& pos, AccumulatorCaches::Cache* cache) const;
Value evaluate(const Position& pos, AccumulatorCaches::Cache* cache) const;

void hint_common_access(const Position& pos, AccumulatorCaches::Cache* cache) const;

Expand All @@ -78,6 +76,9 @@ class Network {
// Input feature converter
LargePagePtr<FeatureTransformer> featureTransformer;

// Router
AlignedPtr<RouterArchitecture> router;

// Evaluation function
AlignedPtr<NetworkArchitecture[]> network;

Expand Down
17 changes: 7 additions & 10 deletions src/nnue/nnue_accumulator.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,12 @@

namespace Stockfish::Eval::NNUE {

using BiasType = std::int16_t;
using PSQTWeightType = std::int32_t;
using IndexType = std::uint32_t;
using BiasType = std::int16_t;
using IndexType = std::uint32_t;

// Class that holds the result of affine transformation of input features
struct alignas(CacheLineSize) Accumulator {
std::int16_t accumulation[COLOR_NB][TransformedFeatureDimensions];
std::int32_t psqtAccumulation[COLOR_NB][PSQTBuckets];
bool computed[COLOR_NB];
};

Expand Down Expand Up @@ -71,18 +69,17 @@ struct AccumulatorCaches {
struct alignas(CacheLineSize) Cache {

struct alignas(CacheLineSize) Entry {
BiasType accumulation[TransformedFeatureDimensions];
PSQTWeightType psqtAccumulation[PSQTBuckets];
Bitboard byColorBB[COLOR_NB];
Bitboard byTypeBB[PIECE_TYPE_NB];
BiasType accumulation[TransformedFeatureDimensions];
Bitboard byColorBB[COLOR_NB];
Bitboard byTypeBB[PIECE_TYPE_NB];

// To initialize a refresh entry, we set all its bitboards empty,
// so we put the biases in the accumulation, without any weights on top
void clear(const BiasType* biases) {

std::memcpy(accumulation, biases, sizeof(accumulation));
std::memset((uint8_t*) this + offsetof(Entry, psqtAccumulation), 0,
sizeof(Entry) - offsetof(Entry, psqtAccumulation));
std::memset((uint8_t*) this + offsetof(Entry, byColorBB), 0,
sizeof(Entry) - offsetof(Entry, byColorBB));
}
};

Expand Down
119 changes: 117 additions & 2 deletions src/nnue/nnue_architecture.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,123 @@ using FeatureSet = Features::HalfKAv2_hm;

// Number of input feature dimensions after conversion
constexpr IndexType TransformedFeatureDimensions = 2048;
constexpr IndexType PSQTBuckets = 16;
constexpr IndexType LayerStacks = 16;
constexpr IndexType LayerStacks = 64;

struct RouterArchitecture {
static constexpr int FC_0_OUTPUTS = 16;
static constexpr int FC_1_OUTPUTS = 32;

Layers::AffineTransformSparseInput<TransformedFeatureDimensions, FC_0_OUTPUTS> fc_0;
Layers::SqrClippedReLU<FC_0_OUTPUTS> ac_sqr_0;
Layers::ClippedReLU<FC_0_OUTPUTS> ac_0;
Layers::AffineTransform<FC_0_OUTPUTS * 2, FC_1_OUTPUTS> fc_1;
Layers::ClippedReLU<FC_1_OUTPUTS> ac_1;
Layers::AffineTransform<FC_1_OUTPUTS, LayerStacks> fc_2;

// Hash value embedded in the evaluation file
static constexpr std::uint32_t get_hash_value() {
// input slice hash
std::uint32_t hashValue = 0xEC42E90Du;
hashValue ^= TransformedFeatureDimensions * 2;

hashValue = decltype(fc_0)::get_hash_value(hashValue);
hashValue = decltype(ac_0)::get_hash_value(hashValue);
hashValue = decltype(fc_1)::get_hash_value(hashValue);
hashValue = decltype(ac_1)::get_hash_value(hashValue);
hashValue = decltype(fc_2)::get_hash_value(hashValue);

return hashValue;
}

// Read network parameters
bool read_parameters(std::istream& stream) {
return fc_0.read_parameters(stream) && ac_0.read_parameters(stream)
&& fc_1.read_parameters(stream) && ac_1.read_parameters(stream)
&& fc_2.read_parameters(stream);
}

// Write network parameters
bool write_parameters(std::ostream& stream) const {
return fc_0.write_parameters(stream) && ac_0.write_parameters(stream)
&& fc_1.write_parameters(stream) && ac_1.write_parameters(stream)
&& fc_2.write_parameters(stream);
}

std::int32_t argmax_avx2(const decltype(fc_2)::OutputType* array) {
__m256i max_vals = _mm256_set1_epi32(
std::numeric_limits<std::int32_t>::min()); // Initialize with minimum int32 values.
__m256i max_indices = _mm256_setzero_si256(); // Indices for max values.
__m256i indices = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); // Index vector.
__m256i index_increment = _mm256_set1_epi32(8); // Increment for indices.

for (IndexType i = 0; i < LayerStacks; i += 8)
{
__m256i current_vals = _mm256_loadu_si256((const __m256i*) &array[i]);

// Compare current values with max_vals.
__m256i mask = _mm256_cmpgt_epi32(current_vals, max_vals);

// Update max_vals and max_indices where mask is true.
max_vals = _mm256_blendv_epi8(max_vals, current_vals, mask);
max_indices = _mm256_blendv_epi8(max_indices, indices, mask);

// Increment indices.
indices = _mm256_add_epi32(indices, index_increment);
}

std::int32_t max_value = std::numeric_limits<std::int32_t>::min();
std::int32_t max_index = -1;

// Find the max value and index within the AVX2 registers.
alignas(CacheLineSize) int32_t max_vals_array[8];
alignas(CacheLineSize) int32_t max_indices_array[8];
_mm256_store_si256((__m256i*) max_vals_array, max_vals);
_mm256_store_si256((__m256i*) max_indices_array, max_indices);

for (int j = 0; j < 8; ++j)
{
if (max_vals_array[j] > max_value)
{
max_value = max_vals_array[j];
max_index = max_indices_array[j];
}
}

return max_index;
}

std::int32_t propagate(const TransformedFeatureType* transformedFeatures) {
struct alignas(CacheLineSize) Buffer {
alignas(CacheLineSize) decltype(fc_0)::OutputBuffer fc_0_out;
alignas(CacheLineSize) decltype(ac_0)::OutputType ac_0_out[FC_0_OUTPUTS * 2];
alignas(CacheLineSize) decltype(fc_1)::OutputBuffer fc_1_out;
alignas(CacheLineSize) decltype(ac_1)::OutputBuffer ac_1_out;
alignas(CacheLineSize) decltype(fc_2)::OutputBuffer fc_2_out;

Buffer() { std::memset(this, 0, sizeof(*this)); }
};

#if defined(__clang__) && (__APPLE__)
// workaround for a bug reported with xcode 12
static thread_local auto tlsBuffer = std::make_unique<Buffer>();
// Access TLS only once, cache result.
Buffer& buffer = *tlsBuffer;
#else
alignas(CacheLineSize) static thread_local Buffer buffer;
#endif

fc_0.propagate(transformedFeatures, buffer.fc_0_out);
ac_sqr_0.propagate(buffer.fc_0_out, buffer.ac_0_out);
ac_0.propagate(buffer.fc_0_out, buffer.ac_0_out + FC_0_OUTPUTS);
fc_1.propagate(buffer.ac_0_out, buffer.fc_1_out);
ac_1.propagate(buffer.fc_1_out, buffer.ac_1_out);
fc_2.propagate(buffer.ac_1_out, buffer.fc_2_out);

std::int32_t outputValue = argmax_avx2(buffer.fc_2_out);

return outputValue;
}
};

struct NetworkArchitecture {
static constexpr int FC_0_OUTPUTS = 15;
Expand Down
Loading

0 comments on commit af885fe

Please sign in to comment.