Skip to content

Commit

Permalink
Switch decompression algorithm to zstandard
Browse files Browse the repository at this point in the history
1. Improved decompression speed leads to faster engine startup (around 4 times faster).
2. Remove support for loading uncompressed NNUE files.
3. Use some glob operations to simplify the Makefile.
  • Loading branch information
PikaCat-OuO committed Sep 28, 2024
1 parent 6b1d57c commit 36eca8d
Show file tree
Hide file tree
Showing 42 changed files with 25,347 additions and 13,495 deletions.
26 changes: 9 additions & 17 deletions src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -52,24 +52,13 @@ BINDIR = $(PREFIX)/bin
PGOBENCH = $(WINE_PATH) ./$(EXE) bench

### Source and object files
SRCS = benchmark.cpp bitboard.cpp evaluate.cpp main.cpp \
misc.cpp movegen.cpp movepick.cpp position.cpp \
search.cpp thread.cpp timeman.cpp tt.cpp uci.cpp ucioption.cpp tune.cpp \
nnue/nnue_misc.cpp nnue/features/half_ka_v2_hm.cpp nnue/network.cpp engine.cpp score.cpp memory.cpp \
external/zip.cpp
SRCS = $(shell find . -name '*.cpp') $(shell find . -name '*.S')

HEADERS = benchmark.h bitboard.h evaluate.h misc.h movegen.h movepick.h magics.h \
nnue/nnue_misc.h nnue/features/half_ka_v2_hm.h nnue/layers/affine_transform.h \
nnue/layers/affine_transform_sparse_input.h nnue/layers/clipped_relu.h nnue/layers/simd.h \
nnue/layers/sqr_clipped_relu.h nnue/nnue_accumulator.h nnue/nnue_architecture.h \
nnue/nnue_common.h nnue/nnue_feature_transformer.h position.h \
search.h thread.h thread_win32_osx.h timeman.h \
tt.h tune.h types.h uci.h ucioption.h perft.h nnue/network.h engine.h score.h numa.h memory.h \
external/zip.h external/miniz.h
HEADERS = $(shell find . -name '*.h')

OBJS = $(notdir $(SRCS:.cpp=.o))
OBJS = $(notdir $(patsubst %.cpp,%.o,$(patsubst %.S,%.o,$(SRCS))))

VPATH = external:nnue:nnue/features
VPATH = $(shell find . -type d | tr '\n' ':')

### ==========================================================================
### Section 2. High-level Configuration
Expand Down Expand Up @@ -930,12 +919,12 @@ clean: objclean profileclean

# clean binaries and objects
objclean:
@rm -f pikafish pikafish.exe *.o ./external/*.o ./nnue/*.o ./nnue/features/*.o
@rm -f pikafish pikafish.exe $(shell find . -name '*.o')

# clean auxiliary profiling files
profileclean:
@rm -rf profdir
@rm -f bench.txt *.gcda *.gcno ./external/*.gcda ./nnue/*.gcda ./nnue/features/*.gcda *.s PGOBENCH.out
@rm -f bench.txt $(shell find . -name '*.gcda' -o -name '*.gcno') *.s PGOBENCH.out
@rm -f pikafish.profdata *.profraw
@rm -f pikafish.*args*
@rm -f pikafish.*lt*
Expand Down Expand Up @@ -1021,6 +1010,9 @@ config-sanity: net
$(EXE): $(OBJS)
+$(CXX) -o $@ $(OBJS) $(LDFLAGS)

# Pattern rule: build an object file from a .S source by invoking the C++
# compiler driver in compile-only mode (-c) with the regular CXXFLAGS.
%.o: %.S
$(CXX) $(CXXFLAGS) -c -o $@ $<

# Force recompilation to ensure version info is up-to-date
misc.o: FORCE
FORCE:
Expand Down
54 changes: 54 additions & 0 deletions src/external/common/allocations.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
* in the COPYING file in the root directory of this source tree).
* You may select, at your option, one of the above-listed licenses.
*/

/* This file provides custom allocation primitives
*/

#define ZSTD_DEPS_NEED_MALLOC
#include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */

#include "compiler.h" /* MEM_STATIC */
#define ZSTD_STATIC_LINKING_ONLY
#include "../zstd.h" /* ZSTD_customMem */

#ifndef ZSTD_ALLOCATIONS_H
#define ZSTD_ALLOCATIONS_H

/* custom memory allocation functions */

/* ZSTD_customMalloc() :
 * Allocates `size` bytes.
 * When customMem.customAlloc is set, the request is forwarded to it together
 * with customMem.opaque; otherwise ZSTD_malloc() is used.
 * @return : whatever the selected allocator returns. */
MEM_STATIC void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem) {
    if (!customMem.customAlloc) {
        /* no user-supplied allocator : use the default one */
        return ZSTD_malloc(size);
    }
    return customMem.customAlloc(customMem.opaque, size);
}

/* ZSTD_customCalloc() :
 * Allocates `size` bytes and sets them to zero.
 * When customMem.customAlloc is set, the memory is obtained from it (with
 * customMem.opaque) and then cleared with ZSTD_memset();
 * otherwise ZSTD_calloc(1, size) is used.
 * @return : the allocated buffer, or the allocator's failure value
 *           (a NULL result from customAlloc is returned untouched). */
MEM_STATIC void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem) {
    if (customMem.customAlloc)
    {
        /* calloc implemented as malloc+memset;
         * not as efficient as calloc, but next best guess for custom malloc */
        void* const ptr = customMem.customAlloc(customMem.opaque, size);
        /* Only clear the buffer when the allocation succeeded :
         * writing through a NULL pointer is undefined behavior. */
        if (ptr != NULL)
        {
            ZSTD_memset(ptr, 0, size);
        }
        return ptr;
    }
    return ZSTD_calloc(1, size);
}

/* ZSTD_customFree() :
 * Releases `ptr`. A NULL `ptr` is ignored.
 * When customMem.customFree is set, it is called with customMem.opaque and
 * `ptr`; otherwise ZSTD_free() is used. */
MEM_STATIC void ZSTD_customFree(void* ptr, ZSTD_customMem customMem) {
    if (ptr == NULL)
    {
        return; /* nothing to release */
    }
    if (customMem.customFree)
    {
        customMem.customFree(customMem.opaque, ptr);
        return;
    }
    ZSTD_free(ptr);
}

#endif /* ZSTD_ALLOCATIONS_H */
227 changes: 227 additions & 0 deletions src/external/common/bits.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
* in the COPYING file in the root directory of this source tree).
* You may select, at your option, one of the above-listed licenses.
*/

#ifndef ZSTD_BITS_H
#define ZSTD_BITS_H

#include "mem.h"

/* ZSTD_countTrailingZeros32_fallback() :
 * Portable count of trailing zero bits in `val`, using a De Bruijn
 * multiplication and a 32-entry lookup table.
 * `val` must be non-zero (checked by assert).
 * @return : index (0-31) of the lowest set bit. */
MEM_STATIC unsigned ZSTD_countTrailingZeros32_fallback(U32 val) {
    assert(val != 0);
    {
        static const U32 DeBruijnBytePos[32] = {0,  1,  28, 2,  29, 14, 24, 3,  30, 22, 20,
                                                15, 25, 17, 4,  8,  31, 27, 13, 23, 21, 19,
                                                16, 7,  26, 12, 18, 6,  11, 5,  10, 9};
        /* Isolate the lowest set bit. The negation is done in unsigned
         * arithmetic : negating the value as a signed 32-bit integer would
         * overflow (undefined behavior) when val == 0x80000000. */
        U32 const lowestSetBit = val & (U32) (0U - val);
        return DeBruijnBytePos[((U32) (lowestSetBit * 0x077CB531U)) >> 27];
    }
}

/* ZSTD_countTrailingZeros32() :
 * Counts the trailing zero bits of `val`, which must be non-zero.
 * Selects a compiler intrinsic at compile time when one is available
 * and uses ZSTD_countTrailingZeros32_fallback() otherwise. */
MEM_STATIC unsigned ZSTD_countTrailingZeros32(U32 val) {
    assert(val != 0);
#if defined(_MSC_VER)
#    if STATIC_BMI2 == 1
    return (unsigned) _tzcnt_u32(val);
#    else
    if (val == 0)
    {
        /* Should not reach this code path */
        __assume(0);
    }
    {
        unsigned long bitIndex;
        _BitScanForward(&bitIndex, val);
        return (unsigned) bitIndex;
    }
#    endif
#elif (defined(__GNUC__) && (__GNUC__ >= 4)) || defined(__ICCARM__)
    /* GCC-compatible compilers and IAR share the same builtin */
    return (unsigned) __builtin_ctz(val);
#else
    return ZSTD_countTrailingZeros32_fallback(val);
#endif
}

/* ZSTD_countLeadingZeros32_fallback() :
 * Portable count of leading zero bits in `val`, which must be non-zero
 * (checked by assert). The highest set bit is first propagated into every
 * lower position, then a De Bruijn multiplication indexes a lookup table. */
MEM_STATIC unsigned ZSTD_countLeadingZeros32_fallback(U32 val) {
    static const U32 DeBruijnClz[32] = {0,  9,  1,  10, 13, 21, 2,  29, 11, 14, 16,
                                        18, 22, 25, 3,  30, 8,  12, 20, 28, 15, 17,
                                        24, 7,  19, 27, 23, 6,  26, 5,  4,  31};
    unsigned shift;
    assert(val != 0);
    /* smear the highest set bit downwards : shifts of 1, 2, 4, 8, 16 */
    for (shift = 1; shift <= 16; shift <<= 1)
    {
        val |= val >> shift;
    }
    return 31 - DeBruijnClz[(val * 0x07C4ACDDU) >> 27];
}

/* ZSTD_countLeadingZeros32() :
 * Counts the leading zero bits of `val`, which must be non-zero.
 * Selects a compiler intrinsic at compile time when one is available
 * and uses ZSTD_countLeadingZeros32_fallback() otherwise. */
MEM_STATIC unsigned ZSTD_countLeadingZeros32(U32 val) {
    assert(val != 0);
#if defined(_MSC_VER)
#    if STATIC_BMI2 == 1
    return (unsigned) _lzcnt_u32(val);
#    else
    if (val == 0)
    {
        /* Should not reach this code path */
        __assume(0);
    }
    {
        unsigned long bitIndex;
        _BitScanReverse(&bitIndex, val);
        /* _BitScanReverse yields a bit index; convert it to a zero count */
        return (unsigned) (31 - bitIndex);
    }
#    endif
#elif (defined(__GNUC__) && (__GNUC__ >= 4)) || defined(__ICCARM__)
    /* GCC-compatible compilers and IAR share the same builtin */
    return (unsigned) __builtin_clz(val);
#else
    return ZSTD_countLeadingZeros32_fallback(val);
#endif
}

/* ZSTD_countTrailingZeros64() :
 * Counts the trailing zero bits of the 64-bit `val`, which must be non-zero.
 * Uses a 64-bit intrinsic when the build target provides one; otherwise the
 * value is split into two 32-bit words handled by ZSTD_countTrailingZeros32(). */
MEM_STATIC unsigned ZSTD_countTrailingZeros64(U64 val) {
    assert(val != 0);
#if defined(_MSC_VER) && defined(_WIN64)
#    if STATIC_BMI2 == 1
    return (unsigned) _tzcnt_u64(val);
#    else
    if (val == 0)
    {
        /* Should not reach this code path */
        __assume(0);
    }
    {
        unsigned long bitIndex;
        _BitScanForward64(&bitIndex, val);
        return (unsigned) bitIndex;
    }
#    endif
#elif (defined(__GNUC__) && (__GNUC__ >= 4) && defined(__LP64__)) || defined(__ICCARM__)
    return (unsigned) __builtin_ctzll(val);
#else
    {
        U32 const highWord = (U32) (val >> 32);
        U32 const lowWord  = (U32) val;
        if (lowWord != 0)
        {
            return ZSTD_countTrailingZeros32(lowWord);
        }
        /* low half is all zeros : the first set bit is in the high half */
        return 32 + ZSTD_countTrailingZeros32(highWord);
    }
#endif
}

/* ZSTD_countLeadingZeros64() :
 * Counts the leading zero bits of the 64-bit `val`, which must be non-zero.
 * Uses a 64-bit intrinsic when the build target provides one; otherwise the
 * value is split into two 32-bit words handled by ZSTD_countLeadingZeros32(). */
MEM_STATIC unsigned ZSTD_countLeadingZeros64(U64 val) {
    assert(val != 0);
#if defined(_MSC_VER) && defined(_WIN64)
#    if STATIC_BMI2 == 1
    return (unsigned) _lzcnt_u64(val);
#    else
    if (val == 0)
    {
        /* Should not reach this code path */
        __assume(0);
    }
    {
        unsigned long bitIndex;
        _BitScanReverse64(&bitIndex, val);
        /* _BitScanReverse64 yields a bit index; convert it to a zero count */
        return (unsigned) (63 - bitIndex);
    }
#    endif
#elif (defined(__GNUC__) && (__GNUC__ >= 4)) || defined(__ICCARM__)
    return (unsigned) (__builtin_clzll(val));
#else
    {
        U32 const highWord = (U32) (val >> 32);
        U32 const lowWord  = (U32) val;
        if (highWord != 0)
        {
            return ZSTD_countLeadingZeros32(highWord);
        }
        /* high half is all zeros : the first set bit is in the low half */
        return 32 + ZSTD_countLeadingZeros32(lowWord);
    }
#endif
}

/* ZSTD_NbCommonBytes() :
 * Converts the position of the first set bit of `val` into a byte count
 * (bit position divided by 8).
 * Little-endian targets count zero bits from the least-significant end,
 * big-endian targets from the most-significant end; the 64-bit or 32-bit
 * counter is chosen according to MEM_64bits().
 * `val` is expected to be non-zero (the bit counters assert on it). */
MEM_STATIC unsigned ZSTD_NbCommonBytes(size_t val) {
    unsigned zeroBits;
    if (MEM_isLittleEndian())
    {
        zeroBits = MEM_64bits() ? ZSTD_countTrailingZeros64((U64) val)
                                : ZSTD_countTrailingZeros32((U32) val);
    }
    else
    { /* Big Endian CPU */
        zeroBits = MEM_64bits() ? ZSTD_countLeadingZeros64((U64) val)
                                : ZSTD_countLeadingZeros32((U32) val);
    }
    return zeroBits >> 3; /* bits -> whole bytes */
}

/* ZSTD_highbit32() :
 * @return : index (0-31) of the highest set bit of `val`.
 * `val` must be non-zero (checked by assert). */
MEM_STATIC unsigned ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */
{
    assert(val != 0);
    {
        unsigned const leadingZeros = ZSTD_countLeadingZeros32(val);
        return 31 - leadingZeros;
    }
}

/* ZSTD_rotateRight_*():
* Rotates a bitfield to the right by "count" bits.
* https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts
*/
MEM_STATIC
U64 ZSTD_rotateRight_U64(U64 const value, U32 count) {
    /* Both shift amounts are masked to 0-63, so neither shift can reach the
     * full width of the type. */
    U32 const rightShift = count & 0x3F; /* for fickle pattern recognition */
    U32 const leftShift  = (0U - rightShift) & 0x3F;
    assert(count < 64);
    return (value >> rightShift) | (U64) (value << leftShift);
}

MEM_STATIC
U32 ZSTD_rotateRight_U32(U32 const value, U32 count) {
    /* Both shift amounts are masked to 0-31, so neither shift can reach the
     * full width of the type. */
    U32 const rightShift = count & 0x1F; /* for fickle pattern recognition */
    U32 const leftShift  = (0U - rightShift) & 0x1F;
    assert(count < 32);
    return (value >> rightShift) | (U32) (value << leftShift);
}

MEM_STATIC
U16 ZSTD_rotateRight_U16(U16 const value, U32 count) {
    /* Both shift amounts are masked to 0-15; the left-shifted part is
     * truncated back to 16 bits before being merged. */
    U32 const rightShift = count & 0x0F; /* for fickle pattern recognition */
    U32 const leftShift  = (0U - rightShift) & 0x0F;
    assert(count < 16);
    return (value >> rightShift) | (U16) (value << leftShift);
}

#endif /* ZSTD_BITS_H */
Loading

0 comments on commit 36eca8d

Please sign in to comment.