Skip to content

Commit

Permalink
Working and tested solution
Browse files Browse the repository at this point in the history
  • Loading branch information
3nthusiast committed Apr 12, 2024
1 parent 4a8c634 commit a487f13
Show file tree
Hide file tree
Showing 2 changed files with 117 additions and 47 deletions.
33 changes: 31 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,32 @@
Utilizing CUDA + Numba to calculate entropy.
## Utilizing CUDA + Numba to calculate entropy.

Still not faster than numpy :/
Around 10% faster than the NumPy solution for a single file, and much faster for batches of files (up to 300× faster on my hardware).

```python

from scipy.stats import entropy
import numpy as np

def entropy1(labels, base=None):
    """Compute the Shannon entropy of a bytes-like buffer.

    Parameters
    ----------
    labels : bytes-like
        Raw byte buffer; interpreted as uint8 values.
    base : float, optional
        Logarithm base. Defaults to 2 (entropy in bits per byte).

    Returns
    -------
    float
        Entropy of the byte-value distribution.
    """
    labels = np.frombuffer(labels, dtype=np.uint8)

    # Only the counts matter for entropy; the unique values are discarded.
    _, counts = np.unique(labels, return_counts=True)
    # Bug fix: the original ignored `base` and hardcoded base=2. Default
    # behavior (base 2) is preserved for existing callers.
    return entropy(counts, base=2 if base is None else base)

```

Still in development


## Goal

Quickly calculate entropy of over 200k (110 GB) malware samples without using any CPU multiprocessing.

Processing all 200k malware samples (110 GB) took ~10,522 seconds (about 2.9 hours).

The malware was stored on network attached storage, which has greatly impacted the I/O performance.


## Remarks

The code has not been optimized or cleaned up yet.
131 changes: 86 additions & 45 deletions entro.py
Original file line number Diff line number Diff line change
@@ -1,68 +1,109 @@
import numpy as np
from numba import cuda
import math

@cuda.jit
def calculate_histogram(data, hist_out):
    """256-bin byte histogram using per-block shared-memory accumulation.

    Each block first builds a private histogram in shared memory (atomics on
    shared memory are much cheaper than on global memory), then merges it
    into the global `hist_out` with one atomic add per bin.

    Parameters
    ----------
    data : device array of uint8
        The bytes to histogram.
    hist_out : device array of uint32, length 256
        Must be zero-initialized by the caller; updated atomically.

    NOTE(review): assumes blockDim.x == 256 so each thread owns exactly one
    bin (`tx` indexes both the zeroing and the final merge) — the host
    launcher in this file always uses 256 threads per block.
    """
    local_hist = cuda.shared.array(256, dtype=np.uint32)
    tx = cuda.threadIdx.x

    # Zero this block's private histogram, one bin per thread.
    local_hist[tx] = 0
    cuda.syncthreads()

    # Grid-stride loop: each thread processes every `stride`-th byte, so any
    # grid size covers any input length.
    idx = cuda.grid(1)
    stride = cuda.gridsize(1)
    for i in range(idx, data.shape[0], stride):
        cuda.atomic.add(local_hist, data[i], 1)
    cuda.syncthreads()

    # Merge the block-private counts into the global histogram.
    cuda.atomic.add(hist_out, tx, local_hist[tx])

@cuda.jit
def calculate_entropy(hist, total_pixels, entropy_out):
    """Compute per-bin entropy terms: entropy_out[i] = -p_i * log2(p_i).

    Parameters
    ----------
    hist : device array, length 256
        Byte-value counts.
    total_pixels : int
        Total number of bytes (sum of `hist`), used to normalize counts
        into probabilities.
    entropy_out : device array of float32, length 256
        Receives one entropy term per bin. Bins with zero count are NOT
        written, so the caller must zero-initialize this array.

    Uses a grid-stride loop over the 256 bins.
    """
    idx = cuda.grid(1)
    stride = cuda.gridsize(1)
    for i in range(idx, hist.shape[0], stride):
        prob = hist[i] / total_pixels
        # log2(0) is undefined; a zero-probability bin contributes 0 by
        # convention (lim p->0 of -p*log2 p), so we simply skip the write.
        if prob != 0:
            entropy_out[i] = -prob * math.log2(prob)
@cuda.jit
def sum_array(arr, result):
    """Shared-memory tree reduction: result[b] = sum of block b's tile of arr.

    Parameters
    ----------
    arr : device array of float32
        Values to sum.
    result : device array of float32, one element per block
        Receives each block's partial sum; the host finishes the reduction.

    NOTE(review): assumes blockDim.x == 256 (the shared scratch size) and
    a power-of-two block size, as launched by `entropy_with_cuda`.
    """
    local_mem = cuda.shared.array(256, dtype=np.float32)

    tid = cuda.threadIdx.x
    i = cuda.blockIdx.x * cuda.blockDim.x + tid

    # BUG FIX: the original loaded arr[i] unconditionally, pulling garbage
    # from out-of-bounds global memory into the reduction whenever
    # arr.shape[0] was not a multiple of the block size. Out-of-range lanes
    # must contribute the additive identity instead.
    if i < arr.shape[0]:
        local_mem[tid] = arr[i]
    else:
        local_mem[tid] = 0.0
    cuda.syncthreads()

    # Pairwise tree reduction. The bound check belongs to the load above,
    # not here: with zero-padded lanes, `tid < s` alone is correct, and
    # syncthreads() stays outside the divergent branch.
    s = cuda.blockDim.x // 2
    while s > 0:
        if tid < s:
            local_mem[tid] += local_mem[tid + s]
        cuda.syncthreads()
        s //= 2

    # Thread 0 holds the block's total after the final pass.
    if tid == 0:
        result[cuda.blockIdx.x] = local_mem[0]

def entropy_with_cuda(data):
    """Compute the Shannon entropy (base 2) of a bytes-like object on the GPU.

    Pipeline of three kernel launches:
      1. `calculate_histogram`  — 256-bin byte histogram,
      2. `calculate_entropy`    — per-bin -p*log2(p) terms,
      3. `sum_array`            — per-block partial sums,
    with the final (tiny) reduction done on the host.

    Parameters
    ----------
    data : bytes-like
        Raw bytes; interpreted as uint8 values.

    Returns
    -------
    float
        Entropy in bits per byte (0.0 .. 8.0).
    """
    total_pixels = len(data)
    data_gpu = cuda.to_device(np.frombuffer(data, dtype=np.uint8))

    # Histogram accumulator must start at zero; kernels update it atomically.
    hist_out = cuda.to_device(np.zeros(256, dtype=np.uint32))

    threadsperblock = 256  # kernels assume exactly 256 threads per block
    blockspergrid_hist = min(
        (total_pixels + threadsperblock - 1) // threadsperblock, 1024
    )
    calculate_histogram[blockspergrid_hist, threadsperblock](data_gpu, hist_out)
    cuda.synchronize()
    # Release the (potentially large) input buffer before the next stage.
    del data_gpu

    # BUG FIX: cuda.device_array() returns UNINITIALIZED device memory, and
    # calculate_entropy only writes bins whose probability is non-zero, so
    # empty bins previously contributed garbage to the final sum. Upload an
    # explicit zero array instead.
    entropy_out_gpu = cuda.to_device(np.zeros(256, dtype=np.float32))

    blockspergrid_entropy = min(
        (hist_out.size + threadsperblock - 1) // threadsperblock, 1024
    )
    calculate_entropy[blockspergrid_entropy, threadsperblock](
        hist_out, total_pixels, entropy_out_gpu
    )
    cuda.synchronize()
    del hist_out

    # One partial sum per block (uninitialized is fine: every slot is written).
    result = cuda.device_array(blockspergrid_entropy, dtype=np.float32)
    sum_array[blockspergrid_entropy, threadsperblock](entropy_out_gpu, result)
    cuda.synchronize()
    del entropy_out_gpu

    # Finish the reduction on the host.
    entropy_sum = result.copy_to_host()
    del result
    return entropy_sum.sum()

return entropy
def is_supported_cuda():
    """Return a falsy value when no usable CUDA device is available,
    otherwise the result of numba's device detection."""
    if not cuda.is_available():
        return False
    return cuda.detect()

0 comments on commit a487f13

Please sign in to comment.